In [None]:
import theano
import lasagne
import pandas as pd
import numpy as np

from nltk.corpus import stopwords
import nltk.data

from sklearn.preprocessing import StandardScaler,MinMaxScaler
from nolearn.lasagne import NeuralNet
from lasagne.updates import nesterov_momentum
from nolearn.lasagne import BatchIterator
from lasagne.layers import InputLayer, Conv2DLayer, DropoutLayer,\
    MaxPool2DLayer, DenseLayer
from lasagne.nonlinearities import softmax, sigmoid
from sklearn.cross_validation import train_test_split

from utils import FactorizeActors, ProcessIMDBData

In [None]:
# Build the working table with the project's ProcessIMDBData helper: it is
# constructed from the CSV path plus a list of column names, and the resulting
# object is then called to produce `df`.
# NOTE(review): presumably the listed columns are discarded -- confirm in utils.
df = ProcessIMDBData(
    'data/imdb.csv',
    ['genres', 'movie_imdb_link', 'color'],
)()

In [None]:
# Separate the target column from the features.
# Guarded so the cell can be re-run: once 'imdb_score' has been removed, a
# second execution keeps the existing `y` and `df` instead of raising KeyError.
if 'imdb_score' in df.columns:
    y = df['imdb_score']
    # Rebind instead of dropping in place, so the frame is never half-mutated.
    df = df.drop('imdb_score', axis=1)

In [None]:
# Feature matrix taken from the DataFrame's values. `x` is kept as a second
# handle on the same array because `xs` is rebound (scaled) further down.
x = df.values
xs = x
# Target, under the name used by the train/test split.
ys = y

## Matriz de correlación completa

In [None]:
import matplotlib.pyplot as plt

# Pairwise correlation of the feature columns. Computed once and reused for
# the heat map (it was previously recomputed on the very same columns).
a = df.corr()

# Sub-frame restricted to the columns that took part in the correlation.
df_corr = df[a.columns.tolist()]

# Tick labels/positions are shared by both axes, so build them a single time.
corr_labels = df_corr.columns.tolist()
corr_ticks = np.arange(0, len(corr_labels))

plt.matshow(a.values)
plt.xticks(corr_ticks, corr_labels, rotation="vertical")
plt.yticks(corr_ticks, corr_labels)
plt.colorbar()
plt.show()

# PCA

In [None]:
from sklearn.decomposition import RandomizedPCA # using randomized Singular Value Decomposition 

# Keep the fitted estimator and its output under separate names: previously
# `Xp` was first the estimator and then the transformed array, which made the
# fitted model (e.g. its explained_variance_ratio_) unreachable afterwards.
pca = RandomizedPCA(n_components=35, random_state=1)
# Projection of the feature matrix onto the 35 retained components.
Xp = pca.fit_transform(xs)

In [None]:
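# Cumulative explained variance per component. NOTE: Xp is the transformed
# array, not the fitted estimator, so this attribute lookup must be made on
# the estimator object instead.
#Xp.explained_variance_ratio_.cumsum()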

In [None]:
from sklearn.cross_validation import train_test_split

# Hold-out split of the raw features and target. random_state pins the
# shuffle so the partition is reproducible; the test size is left at the
# library default.
(X_train, X_test,
 y_train, y_test) = train_test_split(xs, ys, random_state=1)

In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import metrics


def plot_matrix(clf, X_test, y_test):
    """Show a confusion-matrix heat map for ``clf`` on the given test data.

    ``clf`` only needs a ``predict`` method. The predictions are handed to
    ``confusion_matrix`` as its first argument and ``y_test`` as its second,
    so the rows (y axis) correspond to predicted labels and the columns
    (x axis) to true labels, matching the axis captions set below.
    """
    plt.clf()  # reset whatever figure is currently active

    predicted = clf.predict(X_test)
    counts = confusion_matrix(predicted, y_test)

    plt.imshow(counts, interpolation='nearest', cmap=plt.cm.Blues)
    plt.colorbar()
    plt.xlabel("true label")
    plt.ylabel("predicted label")
    plt.show()

In [None]:
# Extremely-randomized-trees classifier fitted on the raw training split.
clf = ExtraTreesClassifier(n_estimators=300,
                           max_features=0.2,
                           n_jobs=2,
                           max_depth=None,
                           # Was 1. A node holding a single sample can never
                           # be split, so 2 (the smallest meaningful value)
                           # grows the same trees and is also accepted by
                           # scikit-learn releases that reject values below 2.
                           min_samples_split=2,
                           random_state=1).fit(X_train, y_train)
# Per-class precision/recall on the held-out split, then accuracy on both
# splits (a large gap between the two indicates over-fitting).
print(classification_report(y_test, clf.predict(X_test)))
print("Score over Testing Data {}".format(clf.score(X_test, y_test)))
print("Score over Training Data {}".format(clf.score(X_train, y_train)))
plot_matrix(clf, X_test, y_test)

In [None]:
# Keep the ExtraTreesClassifier's true labels and predictions for later use;
# both `clf` and the train/test split are rebound further down the notebook.
# NOTE(review): these "scores" are hard class predictions from predict(), not
# probabilities -- confirm that is what the later ROC comparison intends.
y_true_tree, scores_tree = y_test, clf.predict(X_test)

In [None]:
# Per-feature importances of the fitted tree ensemble, one bar per column.
importances = clf.feature_importances_
cols = len(df.columns)

# Column names in DataFrame order, used as tick labels.
text = list(df.columns)
plt.figure(figsize=(20,cols))
# Sanity print: the number of importances should match the number of columns.
print(importances.shape)
# align='edge' makes bar i span [i, i + 1], which is what the tick positions
# (i + 0.5) and the x-limits below assume. Without it, matplotlib versions
# that centre bars by default would draw every label half a bar off.
plt.bar(range(cols), height=importances, width=1., align='edge')
plt.xticks(np.arange(0.5, cols, 1.), text, rotation=90)
plt.xlim((0, cols))
plt.show()

In [None]:
# Raw confusion-matrix counts for whatever estimator `clf` is currently bound
# to, shown as the cell output. Arguments are passed as (y_test, predictions).
confusion_matrix(y_test, clf.predict(X_test))

## Dummy

In [None]:
from sklearn.dummy import DummyClassifier
# Baseline that ignores the features, used as a floor for the real models.
# NOTE: this rebinds `clf`, so the ExtraTrees model is no longer reachable
# through that name once this cell has run.
clf = DummyClassifier(random_state=0).fit(X_train, y_train)
plot_matrix(clf, X_test, y_test)
# The score used to be a bare mid-cell expression, which a notebook never
# displays; print it so the baseline accuracy is actually reported.
print("Score over Testing Data {}".format(clf.score(X_test, y_test)))
print(classification_report(y_test, clf.predict(X_test)))

In [None]:
# Raw confusion-matrix counts for the current `clf` (the dummy baseline when
# the cells are run in order), shown as the cell output.
confusion_matrix(y_test, clf.predict(X_test))

## SVM

In [None]:
from sklearn.svm import SVC

# RBF-kernel support vector classifier trained on the current training split.
# fit() returns the estimator itself, so construction and training are chained.
sv = SVC(kernel='rbf', cache_size=1000).fit(X_train, y_train)

# Per-class report, overall accuracy and confusion-matrix plot on the test split.
print(classification_report(y_test, sv.predict(X_test)))
print(sv.score(X_test, y_test))
plot_matrix(sv, X_test, y_test)



In [None]:
# Raw confusion-matrix counts for the SVM on the test split (cell output).
confusion_matrix(y_test, sv.predict(X_test))

In [None]:
# Keep the SVM's true labels and predictions for later use; the train/test
# split is rebound further down the notebook.
# NOTE(review): as with the tree model, these "scores" are hard predictions
# from predict(), not decision values -- confirm this is intended.
y_true_svm, scores_svm = y_test, sv.predict(X_test)

# Red Neuronal

In [None]:
from lasagne.nonlinearities import linear, tanh, rectify
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
# Start again from the original feature matrix `x` for the neural network and
# take the target as a plain NumPy array.
xs, ys = x, np.array(y)

In [None]:
# Scale every feature to [-1, 1]. feature_range is given as a keyword tuple,
# which is the documented form (a positional list is not accepted everywhere).
# NOTE(review): both scalers are fitted on the full data set before the split,
# so the test rows influence the scaling -- confirm this is acceptable.
std_x = MinMaxScaler(feature_range=(-1, 1))
xs = std_x.fit_transform(np.array(xs))

# Scale the target to [0, 2]. scikit-learn transformers operate on 2-D input,
# so the 1-D target is reshaped to a single column and flattened back; passing
# the 1-D array directly is deprecated or rejected depending on the release.
std_y = MinMaxScaler(feature_range=(0, 2))
ys = std_y.fit_transform(np.array(ys).reshape(-1, 1)).ravel()

# New split on the scaled data; same seed as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(xs, ys,random_state=1)

In [None]:
# Layer specification consumed by create_network(): two 512-unit dense layers,
# each followed by 50% dropout, and a 3-unit softmax output.
layers_0 = [
    # The input width follows the feature matrix instead of a hard-coded 50,
    # so the network keeps matching the data if the column set changes.
    (InputLayer, {'shape': (None, xs.shape[1])}),
    (DenseLayer, {'num_units': 512}),
    (DropoutLayer, {'p': 0.5}),
    (DenseLayer, {'num_units': 512}),
    (DropoutLayer, {'p': 0.5}),
    (DenseLayer, {'num_units': 3, 'nonlinearity': softmax}),
]

In [None]:
class AdjustVariable(object):
    """Epoch callback that moves one network hyper-parameter along a linear ramp.

    Each call sets the attribute of the network called ``name`` (through that
    attribute's ``set_value`` method) to the ramp entry for the most recently
    recorded epoch. The ramp runs from ``start`` to ``stop`` in
    ``nn.max_epochs`` evenly spaced steps and is built on the first call.

    Parameters
    ----------
    name : str
        Attribute of the network to update, e.g. ``'update_learning_rate'``.
    start, stop : float
        First and last value of the ramp.
    """

    def __init__(self, name, start=0.03, stop=0.001):
        self.name = name
        self.start, self.stop = start, stop
        self.ls = None  # ramp values; created lazily in __call__

    def __call__(self, nn, train_history):
        """Apply the ramp value for the last epoch listed in ``train_history``."""
        if self.ls is None:
            self.ls = np.linspace(self.start, self.stop, nn.max_epochs)

        epoch = train_history[-1]['epoch']
        # Clamp to the final ramp entry: if training runs past max_epochs
        # (for example when fitting is continued), hold the last value rather
        # than raising IndexError.
        position = min(epoch, len(self.ls)) - 1
        new_value = np.float32(self.ls[position])
        getattr(nn, self.name).set_value(new_value)

In [None]:
def create_network(npochs=50, batch_s=10000):
    """Build an unfitted nolearn ``NeuralNet`` classifier over ``layers_0``.

    Parameters
    ----------
    npochs : int
        Passed to the network as ``max_epochs``.
    batch_s : int
        Batch size of the training ``BatchIterator``.

    The learning rate and momentum are theano shared variables so that the
    ``AdjustVariable`` callbacks can change them after every epoch.
    """
    # NOTE(review): the learning rate starts at 0.009 while its schedule
    # starts at 0.09, so the value jumps after the first epoch -- confirm
    # this is intended.
    learning_rate = theano.shared(np.float32(0.009))
    momentum = theano.shared(np.float32(0.9))

    # Per-epoch schedules: learning rate ramps down, momentum ramps up.
    schedules = [
        AdjustVariable('update_learning_rate', start=0.09, stop=0.009),
        AdjustVariable('update_momentum', start=0.9, stop=0.9999)
    ]

    network = NeuralNet(
        layers=layers_0,
        update=nesterov_momentum,
        update_learning_rate=learning_rate,
        update_momentum=momentum,
        regression=False,
        batch_iterator_train=BatchIterator(batch_size=batch_s),
        on_epoch_finished=schedules,
        max_epochs=npochs,
        verbose=1)
    return network

#net0 = create_network(10)

In [None]:
# Instantiate the network with 60 as the epoch count (first positional
# argument of create_network); the batch size keeps its default.
net0 = create_network(60)

In [None]:
# Cast the training split before fitting: float32 features, int32 labels.
# NOTE(review): astype(np.int32) truncates any fractional target values --
# confirm the scaled targets are meant to be turned into classes this way.
X_train = X_train.astype(np.float32)
y_train = y_train.astype(np.int32)

# Train the network on the cast training split.
net0.fit(X_train, y_train)

In [None]:
#%matplotlib inline
def plot_loss_net(net0, name="net0"):
    """Plot training and validation loss per epoch on a log-scaled y axis.

    Parameters
    ----------
    net0 : fitted network
        Must expose ``train_history_``, a sequence of per-epoch records with
        ``"train_loss"`` and ``"valid_loss"`` entries.
    name : str, optional
        Prefix used in the legend entries; defaults to ``"net0"``.
    """
    # A fresh figure is opened directly. The former plt.clf() call acted on
    # whichever figure happened to be current (creating an empty one when none
    # existed) and never affected the figure drawn here.
    plt.figure(figsize=(15,5))

    history = net0.train_history_
    train_loss = np.array([entry["train_loss"] for entry in history])
    valid_loss = np.array([entry["valid_loss"] for entry in history])
    plt.plot(train_loss, '--b', linewidth=2, label="{} train".format(name))
    plt.plot(valid_loss, '-b', linewidth=2, label="{} valid".format(name))

    plt.grid()
    plt.legend()
    plt.xlabel("epoch")
    plt.ylabel("loss")
    plt.yscale("log")
    plt.show()

In [None]:
# Draw the training/validation loss curves recorded while fitting net0.
plot_loss_net(net0)

In [None]:
# Cast the test split the same way as the training split was cast before
# fitting: float32 features, int32 labels.
# NOTE(review): astype(np.int32) truncates fractional targets -- confirm.
X_test = X_test.astype(np.float32)
y_test = y_test.astype(np.int32)

# Per-class report and confusion-matrix plot for the network on the test split.
print(classification_report(y_test, net0.predict(X_test)))
plot_matrix(net0, X_test, y_test)

In [None]:
# Raw confusion-matrix counts for the network on the test split (cell output).
confusion_matrix(y_test, net0.predict(X_test))

### Curva ROC 

In [None]:

def get_color_plt(i):
    """Return the plot colour for the ``i``-th curve.

    Indices 0 to 3 map to 'darkorange', 'blue', 'green' and 'red'. Larger
    indices wrap around the same palette, so a fifth or later curve still
    receives a usable colour instead of an empty string. Negative or
    non-integral inputs return ``""`` as before.
    """
    palette = ('darkorange', 'blue', 'green', 'red')
    try:
        position = int(i)
    except (TypeError, ValueError, OverflowError):
        return ""
    # Reject values such as 1.5 or -1 that never matched any original branch.
    if position != i or position < 0:
        return ""
    return palette[position % len(palette)]

# Plot one ROC curve per entry of the input lists
def plotear_grafico_roc(lista_fpr,lista_tpr,lista_nombres):
    """Draw several ROC curves on one figure, with the AUC in each legend entry.

    Parameters
    ----------
    lista_fpr, lista_tpr : sequences of array-like
        False- and true-positive rates; entry ``i`` of each list describes
        curve ``i``.
    lista_nombres : sequence of str
        Display name of each curve, in the same order.
    """
    plt.figure()
    lw = 2
    for i, fpr in enumerate(lista_fpr):
        tpr = lista_tpr[i]
        # Area under this curve, shown in its legend label.
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, color=get_color_plt(i),
                 lw=lw,
                 label='ROC curve (%s, area = %0.2f)' % (lista_nombres[i], roc_auc))

    # Chance-level diagonal, drawn once rather than once per curve.
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")
    plt.show()


In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize

# True labels and predictions of the network on the current test split.
y_true_net = y_test
scores_net = net0.predict(X_test)

# ROC points for each model, treating label 2 as the positive class.
# `thresholds` is overwritten by every call and ends up holding the last one.
# NOTE(review): all three score vectors are hard predictions from predict(),
# not probabilities or decision values, and the tree/SVM labels come from the
# earlier split while the network's come from the later one -- confirm that
# the curves are comparable.
(fpr_net, tpr_net, thresholds) = roc_curve(
    y_true_net, scores_net, pos_label=2)
(fpr_svm, tpr_svm, thresholds) = roc_curve(
    y_true_svm, scores_svm, pos_label=2)
(fpr_tree, tpr_tree, thresholds) = roc_curve(
    y_true_tree, scores_tree, pos_label=2)


plotear_grafico_roc(
    lista_fpr=[fpr_net, fpr_svm, fpr_tree],
    lista_tpr=[tpr_net, tpr_svm, tpr_tree],
    lista_nombres=['ANN', 'SVC', 'TreeClassifier'],
)