In [None]:
import pandas as pd
import numpy as np
import gensim
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier

In [None]:
# Load the labelled training sentences, report how many there are and preview
# the first ten rows.
train = pd.read_csv('train.csv')
print(len(train))
train.head(10)

In [None]:
# Load the unlabelled sentences to be scored, report how many there are and
# preview the first ten rows.
test = pd.read_csv('test.csv')
print(len(test))
test.head(10)

In [None]:
# Subset of the training sentences labelled 'EAP', re-indexed from zero.
EAP = train[train['author'] == 'EAP'].reset_index(drop=True)
print("Size of Edgar Allan Poe dataset = {}".format(len(EAP)))
# The label says "%", so scale the share to a percentage instead of printing
# the raw 0-1 fraction.
print("% of Edgar Allan Poe dataset = {0:.1f}".format(100 * len(EAP) / len(train)))
EAP.head(10)

In [None]:
# Subset of the training sentences labelled 'HPL', re-indexed from zero.
HPL = train[train['author'] == 'HPL'].reset_index(drop=True)
print("Size of HP Lovecraft dataset = {}".format(len(HPL)))
# The label says "%", so scale the share to a percentage instead of printing
# the raw 0-1 fraction.
print("% of HP Lovecraft dataset = {0:.1f}".format(100 * len(HPL) / len(train)))
HPL.head(10)

In [None]:
# Subset of the training sentences labelled 'MWS', re-indexed from zero.
MWS = train[train['author'] == 'MWS'].reset_index(drop=True)
print("Size of Mary Shelley dataset = {}".format(len(MWS)))
# The label says "%", so scale the share to a percentage instead of printing
# the raw 0-1 fraction.
print("% of Mary Shelley dataset = {0:.1f}".format(100 * len(MWS) / len(train)))
MWS.head(10)

In [None]:
# Encode the author labels as integer class ids in a new 'label_encoded'
# column. The fitted encoder is kept so ids can be mapped back to labels later.
label_enconder = preprocessing.LabelEncoder()
train['label_encoded'] = label_enconder.fit_transform(train['author'])
train.head(20)

## Pre-processing text

In [None]:
def lower_case(text):
    """Return ``text`` converted to lower case.

    Args:
        text: the string to convert.

    Returns:
        The result of ``text.lower()``.
    """
    lowered = text.lower()
    return lowered

In [None]:
def remove_punkt(text):
    """Return ``text`` with every ASCII punctuation character removed.

    NOTE(review): the intended behaviour is inferred from the function name
    and the "Pre-processing text" section it lives in; confirm that plain
    punctuation stripping is what was meant.

    Args:
        text: the string to clean.

    Returns:
        A copy of ``text`` without any character from ``string.punctuation``.
        Whitespace and all other characters are left untouched.
    """
    import string

    # A deletion table removes all punctuation in a single pass.
    return text.translate(str.maketrans('', '', string.punctuation))

In [None]:
# Store a lower-cased copy of every sentence in 'text_processed'; the raw
# 'text' column is left as it is.
train['text_processed'] = train['text'].apply(lower_case)
train

## Train/test split

In [None]:
# Hold out 20% of the labelled sentences for evaluation; the fixed seed keeps
# the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    train['text_processed'],
    train['label_encoded'],
    test_size=0.2,
    random_state=4,
)
true_label = np.array(y_test)

print("#" * 20 + " Some stats " + "#" * 20)
print("Dataset training: {} uterances".format(len(x_train)))
print("Dataset testing: {} uterances".format(len(x_test)))
print("Different classes: {}".format(len(y_train.unique())))

In [None]:
# TF-IDF vectoriser using scikit-learn's built-in English stop-word list and a
# document-frequency cap of 0.5.
features = TfidfVectorizer(max_df=0.5, stop_words='english')

In [None]:
# Fit the vectoriser on the training split only, then reuse the fitted
# vectoriser to transform the held-out split.
x_train_features = features.fit_transform(x_train)
x_test_features = features.transform(x_test)

In [None]:
# Linear classifier trained with SGD on the TF-IDF features.
# loss='log' is used because later cells call predict_proba() on this model.
# NOTE(review): newer scikit-learn releases spell this loss 'log_loss' —
# confirm against the installed version.
# random_state pins the SGD shuffling so the fitted model, and therefore the
# reported accuracy and the submission, are reproducible between runs. It
# reuses the seed of the train/test split.
model_sgd = SGDClassifier(
    penalty='l2',
    loss='log',
    class_weight='balanced',
    random_state=4,
)
model_sgd.fit(x_train_features, y_train)

In [None]:
# Predict the held-out split and report plain accuracy.
preds_sgd = model_sgd.predict(x_test_features)
# accuracy_score expects (y_true, y_pred). Accuracy itself is symmetric, but
# the documented order avoids a silent bug if this is swapped for another
# metric later.
print("Current Accuracy: {0:.3f}".format(accuracy_score(true_label, preds_sgd)))

### Making predictions

In [None]:
def get_txt_proba_response(msg, vectors, model):
    """Return the class probabilities predicted for a single text.

    Args:
        msg: the text to classify.
        vectors: fitted vectoriser; ``vectors.transform([msg])`` must produce
            the feature row expected by ``model``.
        model: fitted classifier exposing ``classes_`` and ``predict_proba``.

    Returns:
        A dict mapping each decoded class label (as produced by the
        module-level ``label_enconder``) to its predicted probability,
        inserted in order of decreasing probability.
    """
    msg_vec = vectors.transform([msg])
    # predict_proba returns one row per input; only one text was passed in.
    pred_prob = model.predict_proba(msg_vec)[0]
    labels = label_enconder.inverse_transform(model.classes_)

    # Pair labels with probabilities by name rather than by DataFrame column
    # position: the column order of a dict-built DataFrame is not the same in
    # every pandas version, and the positional lookup could return the encoded
    # ids as keys, which breaks callers doing reply['EAP'].
    ranked = sorted(zip(labels, pred_prob), key=lambda pair: pair[1], reverse=True)
    return {label: float(proba) for label, proba in ranked}

In [None]:
# Vectorise the first test sentence by hand and print both the lower-cased
# text and the resulting feature row.
msg = test['text'][0].lower()
print(msg)
msg_vec = features.transform([msg])
print(msg_vec)

In [None]:
# Show the per-author probabilities for the sentence stored in `msg`.
get_txt_proba_response(msg,features, model_sgd)

In [None]:
# Print the first ten test sentences together with their predicted
# probabilities.
# The loop index has its own name: reusing `msg` here replaced the sentence
# string held by `msg` with an integer, so re-running an earlier cell that
# passes `msg` to the model failed.
for idx in range(0, 10):
    text = test['text'][idx]
    print('Sentence {} - {}'.format(idx, text))
    print('Prediction = {} \n'.format(get_txt_proba_response(text, features, model_sgd)))

## Generating submissions

In [None]:
# Load the sample submission file and preview its first ten rows.
sample_submission = pd.read_csv('sample_submission.csv')
sample_submission.head(10)

In [None]:
# Empty submission frame with the four output columns in their final order.
# The columns are given as a list: a set has no defined order, which is why a
# separate re-ordering step was needed, and recent pandas releases refuse a
# set for `columns` altogether.
my_sub = pd.DataFrame(columns=['id', 'EAP', 'HPL', 'MWS'])
my_sub

In [None]:
# Spot-check the response for the second test sentence: print the whole
# mapping, then the 'EAP' entry on its own line.
reply = get_txt_proba_response(test['text'][1], features, model_sgd)
print(reply, reply['EAP'], sep='\n')

In [None]:
# Score every test sentence and assemble the submission table.
# Rows are collected in a plain list and the DataFrame is built once at the
# end, because assigning my_sub.loc[i] row by row re-allocates the frame on
# every iteration.
submission_rows = []
for sentence_id, sentence in zip(test['id'], test['text']):
    reply = get_txt_proba_response(sentence.lower(), features, model_sgd)
    submission_rows.append([sentence_id, reply['EAP'], reply['HPL'], reply['MWS']])

my_sub = pd.DataFrame(submission_rows, columns=['id', 'EAP', 'HPL', 'MWS'])

In [None]:
# Preview the first ten 'EAP' probabilities of the submission.
my_sub['EAP'].head(10)

In [None]:
# Write the submission to disk; index=False keeps the row index out of the file.
my_sub.to_csv('roberto_2.csv',index=False)

## Testing other models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from pprint import pprint
from time import time

In [None]:
# Three-step text classification pipeline: raw token counts, TF-IDF
# re-weighting, then a linear model trained with SGD.
# The step names ('vect', 'tfidf', 'clf') are the prefixes used by the keys of
# the grid-search parameter dict.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    # Seeded so that grid-search scores are comparable between runs; reuses
    # the seed of the train/test split.
    ('clf', SGDClassifier(random_state=4)),
])

In [None]:
# Displays the repr of a default-constructed CountVectorizer; the instance is
# not stored. Presumably a reference for the parameter names tuned by the
# grid search — confirm, or drop this cell.
CountVectorizer()

In [None]:
# Hyper-parameter grid for the pipeline. Keys are '<step name>__<parameter>'.
# Every combination is evaluated, so each added value multiplies the cost of
# the search.
parameters = {
    'vect__max_df': (0.1, 0.5, 0.75, 0.9, 1.0),
    'vect__max_features': (None, 5000, 7000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'vect__stop_words': (None, 'english'),
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    'clf__alpha': (0.00001, 0.000001),
    'clf__penalty': ('l1', 'l2', 'elasticnet'),
    # SGDClassifier's epoch count is `max_iter`; the older `n_iter` spelling is
    # rejected by current scikit-learn as an invalid parameter.
    'clf__max_iter': (10, 50, 80, 200),
    'clf__class_weight': (None, 'balanced')
}

In [None]:
# Display the training labels that the grid search below is fitted on.
y_train

In [None]:
# Exhaustive search over `parameters`, using every available core (n_jobs=-1).
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

# Announce what is about to be searched.
print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
print(parameters)

# Time the whole fit on the raw training text; the pipeline vectorises it.
t0 = time()
grid_search.fit(x_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

# Report the best score and the winning value of each tuned parameter.
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

In [None]:
def transformText(text):
    """Normalise one sentence into a cleaned, stemmed string.

    Steps, in order: lower-case; replace non-ASCII characters with spaces;
    collapse repeated whitespace; drop NLTK English stop words; drop tokens
    shorter than three characters; strip punctuation; strip digits; collapse
    whitespace again; stem with gensim's ``stem_text``.

    Stop words are removed before punctuation is stripped, so a stop word
    with punctuation attached (e.g. ``"the,"``) is not removed.

    Args:
        text: the sentence to clean.

    Returns:
        The string returned by ``gensim.parsing.preprocessing.stem_text`` for
        the cleaned text.
    """
    # Imported locally because the notebook's import cell provides neither
    # name, so calling the function raised NameError.
    # NOTE(review): the NLTK stop-word corpus must be available locally
    # (typically via nltk.download('stopwords')) — confirm for this environment.
    import re
    from nltk.corpus import stopwords

    stops = set(stopwords.words("english"))

    # Convert text to lower case
    text = text.lower()
    # Replace every non-ASCII character with a space
    text = re.sub(r'[^\x00-\x7f]', r' ', text)

    # Collapse runs of whitespace
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Remove all the stop words
    filtered_words = [word for word in text.split() if word not in stops]

    # Remove all tokens shorter than 3 characters
    filtered_words = gensim.corpora.textcorpus.remove_short(filtered_words, minsize=3)

    # Re-join the surviving tokens
    text = " ".join(filtered_words)

    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)

    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)

    # Collapse the whitespace left behind by the two strips above
    text = gensim.corpora.textcorpus.strip_multiple_whitespaces(text)

    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)