In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

def _load_sst_split(path):
    """Read one headerless, tab-separated split into a DataFrame.

    The file is parsed into two columns, ``sentiment`` and ``text``. The
    ``__label__`` prefix is removed from ``sentiment`` and the remainder is
    converted to an integer-valued categorical column.
    """
    split = pd.read_csv(path, sep='\t', header=None, names=['sentiment', 'text'])
    labels = split['sentiment'].str.replace('__label__', '')
    split['sentiment'] = labels.astype(int).astype('category')
    return split


train_filename = 'dataset/sst_train.txt'
test_filename = 'dataset/sst_test.txt'
train_df = _load_sst_split(train_filename)
test_df = _load_sst_split(test_filename)
cols = train_df.columns
# Stack train and test rows into a single frame with a fresh 0..n-1 index.
data = pd.concat([train_df, test_df], ignore_index=True)
data


Unnamed: 0,sentiment,text
0,4,The Rock is destined to be the 21st Century 's...
1,5,The gorgeously elaborate continuation of `` Th...
2,4,Singer/composer Bryan Adams contributes a slew...
3,3,You 'd think by now America would have had eno...
4,4,Yet the act is still charming here .
...,...,...
10749,4,An imaginative comedy/thriller .
10750,5,"( A ) rare , beautiful film ."
10751,5,( An ) hilarious romantic comedy .
10752,4,Never ( sinks ) into exploitation .


In [14]:
import re

import pandas as pd
from keras import Sequential
from keras.layers import Dense, Input, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Characters deleted by remove_punct. A raw string is used so the backslash is
# a literal character rather than the start of an (invalid) escape sequence.
# Note: the backtick, '+', '=' and '|' are not in this set and are kept.
_PUNCTUATIONS = r'''!()-[]{};:'"\,<>./?@#$%^&*_~'''
_PUNCT_TABLE = str.maketrans('', '', _PUNCTUATIONS)


def remove_punct(text):
    """Return ``text`` with every punctuation character deleted.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` in which each character listed in
        ``_PUNCTUATIONS`` has been removed. Nothing is inserted in place of
        a removed character, and the case of the text is left unchanged.
    """
    # One pass over the string instead of one str.replace per matching char.
    return text.translate(_PUNCT_TABLE)


def remove_urls(text):
    """Remove http, https and ftp URLs from ``text``.

    A URL is taken to be the scheme, ``://`` and every following
    non-whitespace character. URLs are removed wherever they occur, not only
    at the start of a line, and the text surrounding a URL is kept.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each URL replaced by the empty string. Whitespace that
        surrounded a URL is left in place.
    """
    # A single pattern covers all three schemes; the scheme names are matched
    # in full (no optional trailing letter) and case-insensitively.
    return re.sub(r'\b(?:https?|ftp)://\S+', '', text, flags=re.IGNORECASE)

def remove_html_tags(text):
    """Delete every ``<...>`` span from ``text``.

    The match is non-greedy, so each span ends at the nearest ``>``. A span
    cannot cross a newline because ``.`` does not match one here.
    """
    return re.sub('<.*?>', '', text)

# Clean text.
# URL and HTML-tag removal must run before remove_punct: remove_punct deletes
# ':', '/', '<' and '>', after which neither remove_urls nor remove_html_tags
# can find anything to match. Digits are stripped after URL removal so that
# URLs are still intact when they are matched.
data["text"] = (
    data.text.map(str)
    .map(lambda x: x.lower())
    .map(lambda x: x.strip())
    .map(remove_urls)
    .map(remove_html_tags)
    .map(lambda x: re.sub(r'\d+', '', x))
    .map(remove_punct)
)

# The sentiment column is not modified here; no label remapping is applied.

In [15]:
def create_model(optimizer="adam", dropout=0.1, init='uniform', nbr_features=2500, dense_nparams=256, nbr_classes=5):
    """Build and compile a feed-forward classifier with one hidden layer.

    Args:
        optimizer: Optimizer name or instance passed to ``model.compile``.
        dropout: Dropout rate applied after the hidden layer.
        init: Kernel initializer for the hidden layer.
        nbr_features: Length of each input vector. It has to equal the number
            of features produced by whatever feeds the model.
        dense_nparams: Number of units in the hidden layer.
        nbr_classes: Number of target classes. Defaults to 5 on the
            assumption that the sentiment labels run from 1 to 5 (the
            displayed data contains labels 3, 4 and 5) -- TODO confirm
            against the full label set.

    Returns:
        A compiled ``Sequential`` model. For more than two classes the output
        layer has ``nbr_classes`` softmax units and the loss is
        ``sparse_categorical_crossentropy``, which takes integer class
        indices as targets. Otherwise the original single sigmoid unit with
        ``binary_crossentropy`` is used.
    """
    model = Sequential()
    model.add(Dense(dense_nparams, activation='relu', input_shape=(nbr_features,), kernel_initializer=init))
    model.add(Dropout(dropout))
    if nbr_classes > 2:
        # A single sigmoid unit can only separate two classes; multi-class
        # targets need one output unit per class.
        model.add(Dense(nbr_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    return model

In [16]:
# Wrap the model factory so it can be used as a scikit-learn estimator.
# The wrapper calls create_model to build a network when it is fitted.
kears_estimator = KerasClassifier(
    verbose=1,
    build_fn=create_model,
)

In [17]:
# Raw text -> TF-IDF features -> scaling -> Keras classifier.
estimator = Pipeline(steps=[
    # Word-level TF-IDF limited to 2500 features.
    ("tfidf", TfidfVectorizer(max_features=2500, analyzer="word")),
    # with_mean=False: scale without centering (TF-IDF output is sparse).
    ("ss", StandardScaler(with_mean=False)),
    ("kc", kears_estimator),
])

In [18]:
# Search space for the grid search. Keys are "<pipeline step>__<parameter>".
# The full cross product is 4 * 2 * 2 * 3 * 3 * 3 * 4 * 6 = 10368 candidates,
# each of which is fitted once per CV fold.
param_grid = dict(
    tfidf__ngram_range=[(1, 1), (1, 2), (2, 2), (1, 3)],
    tfidf__use_idf=[True, False],
    kc__epochs=[10, 100],
    kc__dense_nparams=[32, 256, 512],
    kc__init=['uniform', 'zeros', 'normal'],
    kc__batch_size=[2, 16, 32],
    kc__optimizer=['RMSprop', 'Adam', 'Adamax', 'sgd'],
    kc__dropout=[0.5, 0.4, 0.3, 0.2, 0.1, 0],
)

In [19]:
X = data.text
y = data.sentiment
kfold_splits = 5
# An integer `cv` produces unshuffled, contiguous folds. `data` is the train
# rows followed by the test rows, so the folds would simply follow file order.
# Shuffled, stratified folds with a fixed seed keep the label proportions
# similar across folds and make the split reproducible between runs.
cv_splitter = StratifiedKFold(n_splits=kfold_splits, shuffle=True, random_state=42)
grid = GridSearchCV(
    estimator=estimator,
    param_grid=param_grid,
    cv=cv_splitter,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)

In [None]:
# Run the search over the whole grid.
grid_result = grid.fit(X, y)

# Report the best configuration, then the CV score of every candidate.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
cv_results = grid_result.cv_results_
means = cv_results['mean_test_score']
stds = cv_results['std_test_score']
params = cv_results['params']
for idx, param in enumerate(params):
    mean, stdev = means[idx], stds[idx]
    print("%f (%f) with: %r" % (mean, stdev, param))

Fitting 5 folds for each of 10368 candidates, totalling 51840 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


In [None]:
def create_model(dense_layer_sizes=(), optimizer="adam", dropout=0.1, init='uniform', nbr_features=2500, dense_nparams=256, nbr_classes=5):
    """Build and compile a feed-forward classifier with extra hidden layers.

    Args:
        dense_layer_sizes: Iterable of unit counts, one per additional hidden
            layer stacked after the first. Defaults to no additional layers,
            so the function can be called without arguments.
        optimizer: Optimizer name or instance passed to ``model.compile``.
        dropout: Dropout rate applied after every hidden layer.
        init: Kernel initializer for the first hidden layer only; the
            additional layers are created without an explicit initializer.
        nbr_features: Length of each input vector. It has to equal the number
            of features produced by whatever feeds the model.
        dense_nparams: Number of units in the first hidden layer.
        nbr_classes: Number of target classes. Defaults to 5 on the
            assumption that the sentiment labels run from 1 to 5 (the
            displayed data contains labels 3, 4 and 5) -- TODO confirm
            against the full label set.

    Returns:
        A compiled ``Sequential`` model. For more than two classes the output
        layer has ``nbr_classes`` softmax units and the loss is
        ``sparse_categorical_crossentropy``, which takes integer class
        indices as targets. Otherwise the original single sigmoid unit with
        ``binary_crossentropy`` is used.
    """
    model = Sequential()
    model.add(Dense(dense_nparams, activation='relu', input_shape=(nbr_features,), kernel_initializer=init))
    model.add(Dropout(dropout))
    for layer_size in dense_layer_sizes:
        model.add(Dense(layer_size, activation='relu'))
        model.add(Dropout(dropout))
    if nbr_classes > 2:
        # A single sigmoid unit can only separate two classes; multi-class
        # targets need one output unit per class.
        model.add(Dense(nbr_classes, activation='softmax'))
        loss = 'sparse_categorical_crossentropy'
    else:
        model.add(Dense(1, activation='sigmoid'))
        loss = 'binary_crossentropy'
    model.compile(loss=loss, optimizer=optimizer, metrics=["accuracy"])
    return model