# Unsupervised Learning app. 

## Inserting the Dataset

In [7]:
from io import StringIO
import requests
import json
import pandas as pd

# @hidden_cell
# This function accesses a file in your Object Storage. The definition contains your credentials.
# You might want to remove those credentials before you share your notebook.
def get_object_storage_file_with_credentials_38ba68786cf84e1cbb5cc80f5a49fbed(container, filename):
    """Download a file from Bluemix Object Storage and return it as a StringIO.

    The function requests an auth token from the identity service with
    password credentials, searches the service catalog in the response for
    the public 'dallas' endpoint of the 'object-store' service, and then
    downloads ``<endpoint>/<container>/<filename>`` using that token.

    Args:
        container: name of the Object Storage container.
        filename: name of the object inside the container.

    Returns:
        io.StringIO wrapping the text of the download response.

    Raises:
        requests.HTTPError: if the token request or the download returns an
            HTTP error status.
        RuntimeError: if the service catalog contains no public 'dallas'
            object-store endpoint.
    """
    # SECURITY: the credentials below are hard-coded in the notebook. Remove or
    # rotate them before sharing this file.
    url1 = ''.join(['https://identity.open.softlayer.com', '/v3/auth/tokens'])
    data = {'auth': {'identity': {'methods': ['password'],
            'password': {'user': {'name': 'member_e83bb2ed70e300a3a681580fec6a7d42c6cd8eb2','domain': {'id': 'dd180082cfbb4aba93677ea2bbc9d7ff'},
            'password': 'y]333V2!.D=?Blh='}}}}}
    headers1 = {'Content-Type': 'application/json'}
    resp1 = requests.post(url=url1, data=json.dumps(data), headers=headers1)
    # Fail here on bad credentials instead of with a KeyError on the body below.
    resp1.raise_for_status()
    resp1_body = resp1.json()

    # Stays None when the catalog has no matching endpoint, so the failure is
    # reported explicitly rather than as an UnboundLocalError further down.
    url2 = None
    for e1 in resp1_body['token']['catalog']:
        if e1['type'] != 'object-store':
            continue
        for e2 in e1['endpoints']:
            # No break: as before, the last matching endpoint wins.
            if e2['interface'] == 'public' and e2['region'] == 'dallas':
                url2 = ''.join([e2['url'], '/', container, '/', filename])
    if url2 is None:
        raise RuntimeError("no public 'dallas' object-store endpoint found in the service catalog")

    # The token is delivered in a response header, not in the JSON body.
    s_subject_token = resp1.headers['x-subject-token']
    headers2 = {'X-Auth-Token': s_subject_token, 'accept': 'application/json'}
    resp2 = requests.get(url=url2, headers=headers2)
    # Without this check an error page (e.g. 404) would be handed to the CSV parser.
    resp2.raise_for_status()
    return StringIO(resp2.text)

# Your data file was loaded into a StringIO object and you can process the data.
# Please read the documentation of pandas to learn more about your possibilities to load your data.
# pandas documentation: http://pandas.pydata.org/pandas-docs/stable/io.html
# Movie table: tab-separated, no header row. quoting=3 is csv.QUOTE_NONE, so
# quote characters inside the text are kept as ordinary characters.
df_data_1 = pd.read_csv(
    get_object_storage_file_with_credentials_38ba68786cf84e1cbb5cc80f5a49fbed(
        'DefaultProjectankynotpankygmailcom', 'movieDBe.csv'),
    sep='\t',
    header=None,
    quoting=3
)
# Seed table: comma-separated (pandas default), no header row.
df_data_2 = pd.read_csv(
    get_object_storage_file_with_credentials_38ba68786cf84e1cbb5cc80f5a49fbed(
        'DefaultProjectankynotpankygmailcom', 'seedb.csv'),
    header=None
)

In [8]:
import numpy as np
# insert your team seed number
team_seed_number = 64
# Row `team_seed_number` of the seed table supplies the row indices used to
# select this team's subset from every column of the movie table below.
my_index = df_data_2.iloc[team_seed_number,:].values
titles = df_data_1.iloc[:, [2]].values[my_index] # movie titles (string)
categories = df_data_1.iloc[:, [3]].values[my_index] # movie categories (string)
bins = df_data_1.iloc[:, [4]]
# Column 4 holds comma-separated flags; split them into one numeric column each.
# The builtin `float` replaces the `np.float` alias, which newer NumPy releases
# no longer provide; both denote the same dtype.
catbins = bins[4].str.split(',', expand=True).values.astype(float)[my_index] # movie categories in binary form (1 feature per category)
summaries =  df_data_1.iloc[:, [5]].values[my_index] # movie summaries (string)
corpus = summaries[:,0].tolist() # list form of summaries

- The **titles** matrix contains all the movie titles.
- The **categories** matrix contains the movie categories in string format. E.g.: '"Tragedy",  "Indie",  "Punk rock",  "Addiction Drama",  "Cult",  "Musical",  "Drama",  "Biopic \[feature\]",  "Romantic drama",  "Romance Film",  "Biographical film"'. It is a comma-separated list of strings, with each string representing one category.
- The **catbins** matrix contains the categories of the movies in binary form ([one-hot encoding](https://hackernoon.com/what-is-one-hot-encoding-why-and-when-do-you-have-to-use-it-e3c6186d008f)). Its dimensions are 5,000 x 322 (one column per category). If a movie belongs to a category, the matrix holds 1 in that column; otherwise it holds 0.
- The **summaries** matrix and the **corpus** list contain all the summaries of the movies in string format. Eg: *'The film is based on the real story of a Soviet Internal Troops soldier who killed his entire unit  as a result of Dedovschina. The plot unfolds mostly on board of the prisoner transport rail car guarded by a unit of paramilitary conscripts.'*
- As the **ID** of each movie we consider its line number.

# Application: Movie recommender based on content
<img src="http://clture.org/wp-content/uploads/2015/12/Netflix-Streaming-End-of-Year-Posts.jpg" width="50%">

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Turn the summaries into TF-IDF vectors. English stop words are dropped, and
# max_df / min_df filter terms by document frequency (see the scikit-learn
# TfidfVectorizer documentation for how float thresholds are interpreted).
vectorizer = TfidfVectorizer(
    input='content',
    stop_words='english',
    max_df=0.02,
    min_df=0.005
)
# Learn the vocabulary and vectorise the corpus in one step; `vectorizer`
# stays fitted for later use.
corpus_tf_idf = vectorizer.fit_transform(corpus)

In [10]:
from sklearn.metrics.pairwise import cosine_similarity
# Row index of the movie whose nearest neighbours are inspected; named here so
# the value is defined before its first use instead of repeated as a literal.
target_movie = 1995
# Similarity of the target's TF-IDF row against every row of the corpus,
# returned as a dense array.
cosine_similarity_calc = cosine_similarity(corpus_tf_idf[target_movie], corpus_tf_idf, dense_output=True)

In [11]:
# Must be the same row that `cosine_similarity_calc` was computed for.
target_movie = 1995
# Negating the similarities makes argsort list the most similar movies first.
indices = (-cosine_similarity_calc).argsort()
# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(indices)
# Use `target_movie` everywhere instead of repeating the literal row index.
print('Movie ID = {0}'.format(target_movie), titles[target_movie], summaries[target_movie], categories[target_movie])

[[1995 2138 2224 ..., 1748 1764 4999]]
('Movie ID = 1995', array(['Sonny'], dtype=object), array([ "Sonny  is the son of Jewel  who runs a small brothel in New Orleans, Louisiana. Sonny returns home from the army, staying with his mother while waiting to start the job an army buddy of his promised him. Jewel tries to convince Sonny to come back to working for her as he had before the army, saying many of his old clients still miss him and he was the best gigolo she had ever had. Sonny repeatedly turns her down, wanting to leave that life behind. However, the job he was promised never materializes and he is forced to return to working for his mother. Jewel had recently recruited a new girl to the brothel, Carol , who meets Sonny and falls in love with him. They talk of getting out together. One of Carol's clients, an older man, proposes to her. She initially declines, hoping to go away with Sonny. She and Sonny fall out as he fails to make an effort to get out of the business, instead b

In [12]:
# Persist the TF-IDF matrix to disk so it can be reloaded without recomputing it.
try:
    # Newer scikit-learn no longer ships sklearn.externals.joblib; use the
    # standalone package when it is available.
    import joblib
except ImportError:
    # Older scikit-learn installations that still vendor joblib.
    from sklearn.externals import joblib
joblib.dump(corpus_tf_idf, 'corpus_tf_idf.pkl')

['corpus_tf_idf.pkl']

In [13]:
def content_recommender(target_movie,max_recommendations):
    """Print the movies whose summaries are most similar to a target movie.

    Similarity is the cosine similarity between the target's row of the
    module-level ``corpus_tf_idf`` matrix and every row of that matrix.
    Titles, summaries and categories are read from the module-level
    ``titles``, ``summaries`` and ``categories`` arrays.

    Args:
        target_movie: non-negative row index of the movie to recommend for.
        max_recommendations: maximum number of similar movies to print; fewer
            are printed when the corpus does not contain that many others.

    Returns:
        None. The recommendations are written to standard output.
    """
    cosine_similarity_calc = cosine_similarity(corpus_tf_idf[target_movie],corpus_tf_idf,dense_output=True)
    # Negate so argsort ranks the most similar movies first; take the single row.
    ranked = (-cosine_similarity_calc).argsort()[0]
    # Remove the target explicitly. Skipping position 0 is not reliable: with
    # tied similarities (e.g. an all-zero TF-IDF row) the target need not be first.
    ranked = ranked[ranked != target_movie]
    # Clamp so a large request cannot index past the end of the ranking.
    ranked = ranked[:max(max_recommendations, 0)]

    # Single-string print(...) calls behave the same under Python 2 and 3.
    print('Target movie: {0}'.format(target_movie))
    print('Title: {0} Synopsis: {1} Categories: {2}'.format(
        str(titles[target_movie]), str(summaries[target_movie]), str(categories[target_movie])))
    print('\n')
    print('*** {0} Movies for you! '.format(len(ranked)))
    for position in range(1, len(ranked) + 1):
        # One-element slice keeps the nested-list formatting of the printed values.
        movie = ranked[position - 1:position]
        print('{0} .'.format(position))
        print('Movie ID: {0}'.format(str(movie)))
        print('Title: {0}'.format(str(titles[movie])))
        print('Synopsis: {0}'.format(str(summaries[movie])))
        print('Categories: {0}'.format(str(categories[movie])))
    

In [14]:
content_recommender(9,3)

Target movie: 9
Target movie: 9 
Title: ['Chicken Hawk: Men Who Love Boys'] Synopsis: [ "The film describes the organization and its history. The film attempts to provide context by discussing Ancient Greek pederasty. It presents a series of interviews with NAMBLA members. They describe their feelings regarding young boys and justifications for these relationships. A camera follows a group of NAMBLA members as they participate in the 1993 March on Washington for Lesbian, Gay and Bi Equal Rights and Liberation. They give reasons why NAMBLA should be included in the gay rights movement, to the disapproval of other attendees. According to a 2002  Intermountain Jewish News article, the July 8, 1994 edition of Newsday reported poet and free speech advocate Allen Ginsberg, NAMBLA's most famous member and defender,"] Categories: ['"Crime Fiction",  "Documentary"']


*** 3 Movies for you! 
1 .
Movie ID: [2508]
Title: [['The Butch Factor']]
Synopsis: [[ 'The documentary tackles meanings of masc

In [15]:
# Write the TF-IDF matrix to disk. This cell repeats the dump performed in an
# earlier cell.
try:
    # Newer scikit-learn no longer ships sklearn.externals.joblib; use the
    # standalone package when it is available.
    import joblib
except ImportError:
    # Older scikit-learn installations that still vendor joblib.
    from sklearn.externals import joblib
joblib.dump(corpus_tf_idf, 'corpus_tf_idf.pkl')

['corpus_tf_idf.pkl']

In [16]:
# Reload the TF-IDF matrix written by the dump cell above.
# NOTE: joblib files are pickle-based; only load files you created yourself,
# since unpickling untrusted data can execute arbitrary code.
corpus_tf_idf = joblib.load('corpus_tf_idf.pkl')

In [17]:
# List the working directory, e.g. to confirm that corpus_tf_idf.pkl exists.
!ls

Lab3_Classification_1.1.ipynb	corpus_tf_idf.pkl  test.ipynb
Lab_3_Classification_1.2.ipynb	lab2.ipynb


## Creating dataset for SOM training (unfinished code)
Using Self Organizing Maps (SOM) I will create a 2 dimensional grid with all the movies of our collection based on content and category.

In [18]:
def build_final_set(doc_limit = 5000):
    """Build the SOM training matrix from content and category features.

    Stacks, column-wise, the first ``doc_limit`` rows of the module-level
    ``corpus_tf_idf`` matrix (converted to a dense array) and the first
    ``doc_limit`` rows of the module-level ``catbins`` array.

    Args:
        doc_limit: number of leading rows (movies) to include.

    Returns:
        Dense numpy array with one row per included movie: the TF-IDF columns
        followed by the category columns.
    """
    # Slice the sparse matrix first, so only the requested rows are densified
    # rather than the whole corpus.
    dense_tf_idf = corpus_tf_idf[0:doc_limit,:].toarray()
    final_set = np.hstack((dense_tf_idf, catbins[0:doc_limit,:]))
    return final_set

In [2]:
# Build the feature matrix with the default doc_limit. build_final_set reads
# the globals corpus_tf_idf and catbins, so the cells that define them must
# have been run in this kernel first (otherwise this raises NameError).
final_set = build_final_set()

NameError: global name 'corpus_tf_idf' is not defined

## SOM map training


In [19]:
# install somoclu (unpinned: installs whatever version is current)
!pip install somoclu
# import somoclu, matplotlib
import somoclu
import matplotlib
# we will plot inside the notebook and not in a separate window
%matplotlib inline



In [20]:
# Create the SOM object; all settings are passed positionally.
# NOTE(review): the first two arguments are presumably the map size (10 x 10)
# and the remaining ones look like somoclu's defaults (planar map, rectangular
# grid, gaussian neighbourhood) -- the positional order depends on the
# installed somoclu version, so confirm against its documentation.
som=somoclu.Somoclu(10,10,None,0,'planar','rectangular',False,'gaussian',0.5,None)

In [35]:
# Rebuild the feature matrix from the first 2000 movies only.
final_set=build_final_set(2000)
# NOTE(review): the second argument is presumably the number of training
# epochs, and train() may return None rather than a trained set -- confirm
# against the installed somoclu version.
train_set=som.train(final_set,100)




In [38]:
# Feed the value returned by som.train() into get_surface_state, then derive
# the BMUs from its result.
# NOTE(review): if train() returned None, get_surface_state presumably falls
# back to the data the SOM was trained on -- TODO confirm.
train_set_sur=som.get_surface_state(train_set)
bmus=som.get_bmus(train_set_sur)