# Unbabel Challenge

0 - Machine translation
1 - Human

In [2]:
import nltk
#nltk.download()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn

%matplotlib inline

In [3]:
debug = False

## Part 1: Data

### Load dataset

In [4]:
messages = pd.read_table("data/training.txt", header=None, names=['label', 'message'])

In [146]:
if debug:
    print(messages.head())
print(messages.shape)

(16892, 2)


## Part 2: Basic Exploratory Data Analysis

In [49]:
#messages.groupby('label').describe()

In [50]:
messages['length'] = messages['message'].apply(len)
if debug:
    print(messages.head())

Let's visualize this!

In [51]:
#messages['length'].plot(bins=50, kind='hist')

In [52]:
#messages.length.describe()

In [53]:
#messages[messages['length'] == 6353]['message'].iloc[0]

In [54]:
#messages.hist(column='length', by='label', bins=50,figsize=(10,4))

## Part 3: Text pre-processing 

In [5]:
import string
from nltk.corpus import stopwords
#stopwords.words('spanish')[0:10] # Show some stop words

In [6]:
def text_process(mess):
    """
    Turn a raw message into a list of lowercase tokens.

    Steps performed:
    1. Drop every character that appears in ``string.punctuation``
    2. Split what is left on whitespace
    3. Lowercase each resulting token

    Stopwords are NOT removed: the stopword filter is currently disabled,
    so every token that survives punctuation stripping is returned.
    """
    # Collect the characters that are not listed in string.punctuation.
    kept_chars = []
    for ch in mess:
        if ch not in string.punctuation:
            kept_chars.append(ch)

    # Glue the characters back together and cut the text into tokens.
    tokens = ''.join(kept_chars).split()

    return list(map(str.lower, tokens))

In [7]:
# Check to make sure its working
processed_sentences = messages['message'].apply(text_process)

In [58]:
if debug:
    messages['message'].head()

In [59]:
#print(processed_sentences)

## Part 4: Features creation 

In [8]:
def pos_features(sentence, sentence_pos, i):
    """
    Build the part-of-speech context features for the token at position ``i``.

    sentence     -- sequence of tokens; only its length is used, to detect
                    the last position
    sentence_pos -- sequence of pairs whose second element is read as the
                    tag of the token at the same position
    i            -- position of the token being described

    Returns a dict with the keys "word-pos", "prev-word-pos" and
    "next-word-pos". The sentinels "<START>" and "<END>" stand in for the
    missing neighbour at the first and last position respectively.
    """
    current_tag = sentence_pos[i][1]

    at_start = (i == 0)
    at_end = (i == len(sentence) - 1)

    previous_tag = "<START>" if at_start else sentence_pos[i - 1][1]
    following_tag = "<END>" if at_end else sentence_pos[i + 1][1]

    return {
        "word-pos": current_tag,
        "prev-word-pos": previous_tag,
        "next-word-pos": following_tag,
    }

In [9]:
if debug:
    print(processed_sentences.values)

In [10]:

labels = messages['label']
if debug:
    labels.head()

In [11]:
if debug:
    print(processed_sentences.shape)

In [239]:

def create_features(sentences, labels):
    """
    Build one POS-context feature record per token of every sentence.

    Parameters
    ----------
    sentences : pandas.Series (or any object exposing ``.items()``)
        Maps a sentence index to its list of tokens.
    labels : object indexable with the same keys as ``sentences``
        ``labels[index]`` is attached to every token of that sentence.

    Returns
    -------
    list of (dict, label, index) tuples
        The dict is produced by ``pos_features``; ``index`` is the key of
        the sentence the token came from, so per-token records can be
        grouped back by sentence afterwards.
    """
    features = []
    # Series.iteritems() was removed in pandas 2.0; items() yields the same
    # (index, value) pairs and also exists in older pandas versions.
    for index, sentence in sentences.items():
        # Tag the whole sentence once so each token can see its neighbours' tags.
        sentence_pos = nltk.pos_tag(sentence)
        for i in range(len(sentence)):
            features.append((pos_features(sentence, sentence_pos, i), labels[index], index))
    return features

In [99]:
features = create_features(processed_sentences, labels)

In [100]:

if debug:
    print(features[:10])
print(features[:10])

[({'word-pos': 'NN', 'prev-word-pos': '<START>', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'NN', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1), ({'word-pos': 'FW', 'prev-word-pos': 'FW', 'next-word-pos': 'FW'}, 1)]


In [67]:
#from nltk.tag import pos_tag, pos_tag_sents
#processed_sentences_postag = pos_tag_sents(processed_sentences)
#print(processed_sentences_postag)
#for sentence in processed_sentences:
    #print(list(sentence))
#    print(pos_tag(sentence))

In [75]:
from sklearn.feature_extraction import DictVectorizer
 
vec = DictVectorizer(sparse=False)
X = vec.fit_transform([item[0] for item in features])

In [101]:
print(X.shape)

(431537, 101)


In [16]:
# Peek at a few of the feature names produced by the vectorizer.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever one this installation provides.
if hasattr(vec, "get_feature_names_out"):
    feature_names = list(vec.get_feature_names_out())
else:
    feature_names = vec.get_feature_names()
feature_names[1:15]

["next-word-pos=''",
 'next-word-pos=<END>',
 'next-word-pos=CC',
 'next-word-pos=CD',
 'next-word-pos=DT',
 'next-word-pos=EX',
 'next-word-pos=FW',
 'next-word-pos=IN',
 'next-word-pos=JJ',
 'next-word-pos=JJR',
 'next-word-pos=JJS',
 'next-word-pos=MD',
 'next-word-pos=NN',
 'next-word-pos=NNP']

In [148]:
Y = [item[1] for item in features]

if debug:
    print(Y[:10])

## Part 5: Split dataset (train and test)

The test set is 20% of the entire dataset and the training set is the remaining 80%. Note that scikit-learn's default split would have been 25/75.

In [149]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the function now
# lives in sklearn.model_selection. Fall back to the old location only for
# installations that predate the move.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Fixed seed so the split (and every score derived from it) is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, train_size=0.80, random_state=42)

In [150]:
print(len(Y_train))

345229


In [151]:
#if debug:
print('X_train: ' + str(X_train.shape))
print('X_test: ' + str(X_test.shape))
print('Y_train: ' + str(len(Y_train)))
print('Y_test: ' + str(len(Y_test)))

X_train: (345229, 101)
X_test: (86308, 101)
Y_train: 345229
Y_test: 86308


## Part 6: Training a model

In [32]:
from sklearn import svm
clf = svm.SVC()
clf.fit(X_train, Y_train) 

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [33]:
clf.support_vectors_

<340202x101 sparse matrix of type '<class 'numpy.float64'>'
	with 1020606 stored elements in Compressed Sparse Row format>

In [35]:
import pickle
# now you can save it to a file
with open('SVM_filename_1.pkl', 'wb') as f:
    pickle.dump(clf, f)

# and later you can load it
#with open('filename.pkl', 'rb') as f:
#    clf = pickle.load(f)

In [34]:
print("Training the random forest...")
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees.
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest on the training split: X_train holds the vectorized
# POS-tag context features and Y_train the matching per-token labels
# (0 = machine translation, 1 = human).
#
# This may take a few minutes to run
forest = forest.fit(X_train, Y_train)

Training the random forest...


In [36]:
with open('RandomForest_filename_1.pkl', 'wb') as f:
    pickle.dump(forest, f)

In [None]:
#from nltk.chunk import ne_chunk_sents
#chunks = ne_chunk_sents(processed_sentences_postag)

In [None]:
#chunks

In [None]:
# FIXME(review): this cell cannot run as written. `processed_sentences_postag`
# is only assigned in a commented-out line earlier in the notebook, and
# `test_sents` is not defined anywhere in the visible notebook, so both
# names raise NameError on a fresh kernel.
from nltk.tag.sequential import ClassifierBasedPOSTagger
tagger = ClassifierBasedPOSTagger(train=list(processed_sentences_postag))
tagger.evaluate(test_sents)


In [None]:
#def process_pos()

In [None]:
#from sklearn.naive_bayes import MultinomialNB
#human_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

Let's try classifying our single random message and checking how we do:

In [None]:
#if debug:
#    print('predicted:', human_detect_model.predict(tfidf4)[0])
#    print('expected:', messages.label[3])

### Using Random Forest

In [None]:
#print("Training the random forest...")
#from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
#forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
#forest = forest.fit(messages_tfidf, messages["label"])

## Part 7: Model Evaluation
Now we want to determine how well our model will do overall on the entire dataset. Let's begin by getting all the predictions:

In [None]:
#all_predictions = human_detect_model.predict(messages_tfidf)
#if debug:
#    print(all_predictions)

We can use SciKit Learn's built-in classification report, which returns precision, recall, f1-score, and a column for support (meaning how many cases supported that classification). 

In [None]:
#from sklearn.metrics import classification_report
#if debug:
#    print(classification_report(messages['label'], all_predictions))

In the above "evaluation", we evaluated accuracy on the same data we used for training. You should never actually evaluate on the same dataset you train on!


A proper way is to split the data into a training/test set, where the model only ever sees the training data during its model fitting and parameter tuning. The test data is never used in any way during training. This way, our final evaluation on the test data is representative of true predictive performance.

### Random forest evaluation

In [None]:
#all_predictions_forest = forest.predict(messages_tfidf)
#if debug:
#    print(all_predictions_forest)
#    print(classification_report(messages['label'], all_predictions_forest))

## Part 8: Creating a Data Pipeline

In [None]:
from sklearn.pipeline import Pipeline

#pipeline = Pipeline([
#    ('bow', CountVectorizer(analyzer=text_process)),  # strings to token integer counts
#    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
#    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
#])

In [None]:
#pipeline.fit(msg_train,label_train)

In [None]:
#predictions = pipeline.predict(msg_test)

In [None]:
#if debug:
#    print(classification_report(predictions,label_test))

# SVM

In [256]:
import pickle
with open('SVM_filename_1.pkl', 'rb') as fid:
    clf2 = pickle.load(fid)


In [76]:
clf2.score(X_test, Y_test)

0.50716040228020576

In [38]:
clf2.predict(X_test)

array([0, 0, 0, ..., 1, 0, 0])

# Random Forest

In [20]:
with open('RandomForest_filename_1.pkl', 'rb') as fid:
    rndforest = pickle.load(fid)

In [238]:
#rndforest.score(X_test, Y_test)

In [237]:
#rndforest.predict(X_test)

In [196]:
#test_blind = pd.read_csv("data/test_blind.txt", sep='\t', header=None, names=['label', 'message'],encoding='utf-8')

In [215]:
# Parse the blind test set by hand: one "<label>\t<message>" record per line.
# The context manager guarantees the handle is closed, and the explicit
# encoding keeps the accented Spanish text intact regardless of the locale.
lines = {}
with open('data/test_blind.txt', 'r', encoding='utf-8') as blind_file:
    for i, line in enumerate(blind_file):
        parts = line.strip().split("\t")
        if len(parts) < 2:
            # Fail with the offending line number instead of a bare IndexError.
            raise ValueError("line %d of data/test_blind.txt has no tab-separated message" % (i + 1))
        lines[i] = {'label': parts[0], 'message': parts[1]}
#print(lines)

In [216]:
test_blind = pd.DataFrame.from_dict(lines, orient='index')

print(test_blind.shape)

(3220, 2)


In [257]:
#test_blind

In [218]:
print(test_blind.shape)

(3220, 2)


In [219]:
test_processed_sentences = test_blind['message'].apply(text_process)
test_labels = test_blind['label'] 

In [220]:
print(test_processed_sentences.shape)

(3220,)


In [240]:
test_features = create_features(test_processed_sentences, test_labels)

In [241]:
print(len(test_features))

67703


In [242]:
test_Y = vec.transform([item[0] for item in test_features])

In [243]:
print(test_Y.shape)

(67703, 101)


# PREDICTIONS

In [258]:
#RANDOM FOREST
#predicts_rnd = rndforest.predict(test_Y)
# SVM
predicts_rnd = clf2.predict(test_Y)

In [259]:
len(predicts_rnd)

67703

In [260]:
#print(vec.get_feature_names())

In [261]:
n_sentences = [item[2] for item in test_features]

In [262]:
# Materialise the pairs: a bare zip() is a one-shot iterator in Python 3, so
# re-running any later cell that loops over it would silently see no data.
predictions_ = list(zip(n_sentences, predicts_rnd))

In [263]:

import collections
sentences_test = collections.defaultdict(list)
for i_sentence, label in predictions_:
    sentences_test[i_sentence].append(label)

In [264]:
import numpy as np

# Majority vote per sentence over its per-token predictions.
# Iterate over the test set's own index (not over the dict) so `results`
# always has exactly one entry per sentence, in the same order as
# `test_blind`, even if a sentence produced no tokens and therefore has no
# votes recorded.
results = []
for k in test_blind.index:
    v = sentences_test.get(k, [])
    ones = np.count_nonzero(v)
    total = len(v)
    # Label 0 only on a strict majority of zeros; ties (and sentences
    # without any vote) fall to 1, matching the original rule.
    if (total - ones > (total/2)):
        results.append(0)
    else:
        results.append(1)

In [265]:
#for r in results:
#    print(r)

In [266]:
results_test = pd.DataFrame()
results_test['message'] = test_blind['message']

In [267]:
results_test['label'] =results

In [268]:
#results_test

In [269]:
results_test.to_csv('results_RuiMendes_svm.txt', sep='\t', columns = ['label', 'message'], index=False, index_label=False)

In [270]:
results_test.describe()

Unnamed: 0,label
count,3220.0
mean,0.023602
std,0.151831
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,1.0


In [179]:
results_test.to_csv('results_RuiMendes_2.txt',index=False, )

In [181]:
def print_full(x):
    """
    Print ``x`` (a pandas object) with every row shown.

    ``display.max_rows`` is raised to ``len(x)`` only for the duration of the
    print. ``option_context`` restores whatever value was active before the
    call -- even if printing raises -- instead of resetting the option to
    the pandas default.
    """
    with pd.option_context('display.max_rows', len(x)):
        print(x)

In [187]:
print(test_blind)

     label                                            message
0        _  Bougainvillea en la floración a pesar de el dí...
1        _  El número de servicios de Internet de los últi...
2        _  ¡ Apostamos nuestro dinero a que él hace un gr...
3        _  Este loco Talentosos impresionista Sings conge...
4        _  Nuestra Niñera Rompio con Nosotros con un Mens...
5        _  La multitud se puso un poco mejor con cada art...
6        _  No hubo alteración real de quién era yo , de l...
7        _  Como Zuckerberg ha prometido , Oculus VR comen...
8        _  Todo , desde el modelo básico hasta el más ava...
9        _  Al hacer que el robot sea más como nosotros , ...
10       _  Otra gran debilidad del programa es que no tod...
11       _  Para celebrar el regreso de mi avatar a la pri...
12       _  Se cree que Lea Michele ha sido una diva total...
13       _  Cuando le expliqué a Marlo que no te gustan lo...
14       _  Y aunque sus abs son un espectáculo para la vi...
15      

In [188]:
# Work on a copy: plain assignment would alias `test_blind`, so writing the
# predicted labels would overwrite the original placeholder labels as well.
results_test_2 = test_blind.copy()
results_test_2['label'] = results
# Show a sample only instead of dumping the whole frame.
print(results_test_2.head(15))

      label                                            message
0         0  Bougainvillea en la floración a pesar de el dí...
1         0  El número de servicios de Internet de los últi...
2         0  ¡ Apostamos nuestro dinero a que él hace un gr...
3         0  Este loco Talentosos impresionista Sings conge...
4         0  Nuestra Niñera Rompio con Nosotros con un Mens...
5         1  La multitud se puso un poco mejor con cada art...
6         0  No hubo alteración real de quién era yo , de l...
7         0  Como Zuckerberg ha prometido , Oculus VR comen...
8         0  Todo , desde el modelo básico hasta el más ava...
9         0  Al hacer que el robot sea más como nosotros , ...
10        0  Otra gran debilidad del programa es que no tod...
11        0  Para celebrar el regreso de mi avatar a la pri...
12        0  Se cree que Lea Michele ha sido una diva total...
13        0  Cuando le expliqué a Marlo que no te gustan lo...
14        0  Y aunque sus abs son un espectáculo para l

In [189]:
results_test_2.to_csv('results_RuiMendes_2.txt',index=False, )