In [6]:
#!/usr/bin/env python3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from preprocess import PreprocessNLP

import nltk
from nltk.corpus import wordnet
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, SnowballStemmer, WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

import lightgbm as lgbm

#Keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint

import seaborn as sns
import csv


# Pickled data sets read from the working directory.
train_file, test_file = 'data_train.pkl', 'data_test.pkl'

# Let pandas display long messages and long frames without truncating them.
for option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(option, 500)

print("initializing")

print("loading pkl")
# NOTE(review): allow_pickle=True makes np.load unpickle arbitrary objects;
# only load files from a trusted source.
train, test = (
    np.load(path, allow_pickle=True) for path in (train_file, test_file)
)
# The training pickle is indexed as [0] -> messages, [1] -> labels.
inputs, labels = np.asarray(train[0]), np.asarray(train[1])

# TF-IDF with sub-linear term frequency and L2-normalised rows; terms that
# appear in fewer than two documents are dropped (min_df=2).
tfidf = TfidfVectorizer(
    sublinear_tf=True,
    norm='l2',
    min_df=2,
    encoding='latin-1',
    strip_accents='unicode',
)

unique_labels = np.unique(labels)

# Training frame holds the text and its label; the test frame holds text only.
df = pd.DataFrame({'Messages': inputs, 'Labels': labels})
df_test = pd.DataFrame({'Messages': test})

def prep(inputs_df, X):
    """Run the ``PreprocessNLP`` pipeline and return the resulting frame.

    The ``lower``, ``tokenize`` and ``join`` steps of ``PreprocessNLP`` are
    called in that order. What each step does is defined in the project's
    ``preprocess`` module and is not visible here (presumably lower-casing,
    tokenizing and re-joining the text -- confirm against that module).

    Args:
        inputs_df: Data frame passed to ``PreprocessNLP``.
        X: Second argument passed to ``PreprocessNLP``; the callers in this
            file pass the column name ``'Messages'``.

    Returns:
        The ``data_frame`` attribute of the preprocessor after all three
        steps have run.
    """
    pipeline = PreprocessNLP(inputs_df, X)
    for step in ("lower", "tokenize", "join"):
        getattr(pipeline, step)()
    return pipeline.data_frame

# Run the train and test frames through the same prep() pipeline on the
# 'Messages' column; each frame is replaced by the value prep() returns.
print("preprocessing train set")
df = prep( df, 'Messages' )
print("preprocessing test set")
df_test = prep( df_test, 'Messages')

# TF-IDF features for the models: the vectorizer is fitted on the training
# messages only, then the fitted vectorizer is reused to transform the test
# messages so both matrices share the same vocabulary.
counts_tfidf = tfidf.fit_transform(df['Messages'])
counts_tfidf_test = tfidf.transform(df_test['Messages'])


# sklearn models, configured with values found by earlier grid searches.
# Only the names listed in `classifiers` are fitted below; each name must have
# a matching entry in both `models` and `params`. Commented-out entries are
# alternatives that can be re-enabled.
classifiers = [
    # 'RandomForestClassifier',
    'VotingClassifier',
    # 'AdaBoostClassifier',
    # 'ComplementNB',
    # 'MultinomialNB',
    # 'BernoulliNB',
    # 'SGDClassifier',
]

# Members of the voting ensemble: two naive Bayes variants and a linear model
# trained with SGD.
voting_estimators = [
    ('cnb', ComplementNB(alpha=1.2)),
    ('mnb', MultinomialNB(alpha=0.25)),
    (
        'sgdc',
        SGDClassifier(
            loss='modified_huber',
            penalty='l2',
            max_iter=1000,
            tol=1e-3,
            learning_rate='optimal',
            random_state=42,
        ),
    ),
]

models = {
    # 'RandomForestClassifier': RandomForestClassifier(),
    'VotingClassifier': VotingClassifier(estimators=voting_estimators),
    # 'AdaBoostClassifier': AdaBoostClassifier(),
    # 'ComplementNB': ComplementNB(),
    # 'MultinomialNB': MultinomialNB(),
    # 'BernoulliNB': BernoulliNB(),
    # 'SGDClassifier': SGDClassifier(),
}

# Best parameter grid per model (a single candidate each, so the grid search
# below effectively just cross-validates that one configuration).
params = {
    # 'RandomForestClassifier': { 'n_estimators': [500], 'max_depth': [50] },  # result: 0.4295
    'VotingClassifier': {'voting': ['hard']},
    # 'AdaBoostClassifier':  { 'base_estimator':
    #                     [
    #                     MultinomialNB(alpha=0.25),
    #                     ],
    #                     'n_estimators': [100], 'learning_rate': [1.0] },
    # 'ComplementNB': { 'alpha': [1.55]},
    # 'MultinomialNB': {'alpha' : [0.3]},
    # 'BernoulliNB': {'alpha' : [0.3]},
    # 'SGDClassifier':{ 'loss': ['modified_huber'], 'penalty': ['l2'], 'alpha' : [0.0001], "random_state": [42]}
}



# Hold-out split for comparing the classifiers against each other (currently disabled):
# X_train, X_test, y_train, y_test = train_test_split(counts_tfidf,
#         df['Labels'], test_size=0.1, random_state=42)



preds = []
clfs = classifiers

# Test-set predictions of every fitted classifier, keyed by classifier name.
# Without this only the last classifier's predictions would survive the loop,
# because `predicted` is rebound on every iteration.
predictions_by_model = {}

for classifier in classifiers:
    # 10-fold cross-validated search over the (single-candidate) grid for
    # this classifier, scored by accuracy.
    clf = GridSearchCV(models[classifier], params[classifier], cv=10,
                       verbose=1, scoring='accuracy')

    clf.fit(counts_tfidf, df["Labels"])

    print("Best hyper-parameters:")
    print(clf.best_params_)

    # With the default refit=True, GridSearchCV retrains the best candidate on
    # the full training set, and predict() delegates to that estimator.
    predicted = clf.predict(counts_tfidf_test)
    predictions_by_model[classifier] = predicted
    # pred_acc = np.sum([y_test == y_preds]) / len(y_test)
    # preds.append(pred_acc)

# `predicted` still holds the predictions of the last classifier in
# `classifiers`; that is what the submission file is built from.
# print(list(zip(classifiers, preds)))

# Keras experiment (currently disabled): embedding + dense network on tokenized messages
# df = pd.DataFrame(
#     {'Messages': inputs,
#      'Labels': labels
#     })
#
# df_test = pd.DataFrame(
#     {'Messages': test,
#     })
#
#
# one_hot = pd.get_dummies(df["Labels"])
# df.drop(['Labels'],axis=1,inplace=True)
# df = pd.concat([df,one_hot],axis=1)
#
# X_train, X_test, y_train, y_test = train_test_split(df["Messages"].values,
#         df.drop(['Messages'],axis=1).values, test_size=0.1, random_state=42)
#
#
#
# epochs = 10


#Transform inputs with keras Tokenizer
# num_words = 20000
# tokenizer = Tokenizer(num_words= num_words)
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(X_train)
# sequences = tokenizer.texts_to_sequences(X_train)
# num_words = len(tokenizer.word_index) + 1
# X_train = pad_sequences(sequences, maxlen=200)
# sequences = tokenizer.texts_to_sequences(X_test)
# X_test = pad_sequences(sequences, maxlen=200)
#
#
# model = Sequential()
# model.add(Embedding(num_words, 50, input_length=200))
# model.add(Flatten())
# model.add(Dense(20))
# model.add(Dropout(0.2))
# model.add(Activation('softmax'))
#
# callbacks = [EarlyStopping(monitor='val_loss', patience=2),
#          ModelCheckpoint(filepath='best_model.h5', monitor='val_loss', save_best_only=True)]
# model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# history = model.fit(X_train, y_train,
#                     batch_size=64,
#                     epochs=epochs,
#                     verbose=1,
#                     validation_split=0.1)
# fig = plt.figure()
# plt.plot(history.history['accuracy'])
# plt.plot(history.history['val_accuracy'])
# plt.title('Précision du réseau de neurones en fonction de l\'époque')
# plt.ylabel('Précision')
# plt.xlabel('Époque')
# ax = fig.gca()
# ax.set_xticks(np.arange(0, epochs, 1.))
# ax.set_yticks(np.arange(0, np.ceil(max(history.history['val_accuracy'])), 0.1))
# plt.grid()
# plt.legend(['Entraînement', 'Test'], loc='upper left')
# plt.show()
#
# fig = plt.figure()
# plt.plot(history.history['loss'])
# plt.plot(history.history['val_loss'])
# plt.title('Perte du réseau de neurones en fonction de l\'époque')
# plt.ylabel('Perte')
# plt.xlabel('Époque')
# ax = fig.gca()
# ax.set_xticks(np.arange(0, epochs, 1.))
# ax.set_yticks(np.arange(0, np.ceil(max(max(history.history['loss']), max(history.history['val_loss']))), 0.5))
# plt.grid()
# plt.legend(['Entraînement', 'Test'], loc='upper left')
# plt.show()


# score, acc = model.evaluate(X_test, y_test,
#                             batch_size=64)
# print(score, acc)


 ####Creating the csv submission
# Build the two CSV columns: a header cell followed by one entry per
# prediction. Ids are derived from the number of predictions instead of a
# fixed count, so zip() below can never silently drop predictions when the
# test set size changes.
preds = list(predicted)
preds_to_write = ["Category", *preds]
all_preds = []
ids = ["Id", *range(len(preds))]

# newline='' lets the csv module control line endings itself.
with open("predictions.csv", "w", newline='') as f:
    writer = csv.writer(f)
    writer.writerows(zip(ids, preds_to_write))


initializing
loading pkl
preprocessing train set
lower finished
tokenize done
join finished
preprocessing test set
lower finished
tokenize done
join finished
Fitting 10 folds for each of 1 candidates, totalling 10 fits
Best hyper-parameters:
{'voting': 'hard'}
