# Set up the connection to MongoDB

In [1]:
import nltk
import requests
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from src.config import *
from src.mongohandler import *
from src import recommender

from bson.son import SON
from IPython.display import display, HTML

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cosine_distances
from sklearn.metrics.pairwise import euclidean_distances as euclidean_distances

# IMPORTANT : Remember to start mongodb server if you are running a local version of this project
# run $ sudo systemctl start mongod
# Also, start the api.py Flask server
#     $ python3 api.py

Connected to MongoClient...  


In [None]:
print(flask_api)

In [None]:
nltk.download('vader_lexicon')

# Trying out our basic functions

To interact with the Messenger API we will use the third-party `requests` library.

Additionally, we will leverage MongoDB as a database to store all relevant data for the `users`, `chats` and `messages`. To do this, a `mongohandler` module with helper functions has also been developed and imported.

### To begin, let's use some scripts from the TV show `Rick and Morty` to seed our API.

In [None]:
# Load the Rick and Morty transcripts that are used to seed the API.
# Found this dataset on Kaggle, data can be updated by scraping this wiki:
#  -  https://rickandmorty.fandom.com/wiki/Category:Transcripts

original_df = pd.read_csv('INPUT/RickAndMortyScripts.csv')
# NOTE: `df` is an alias of `original_df`, not a copy -- any in-place change
# made through one name is visible through the other.
df = original_df
df.head(3)  # preview the first three rows

### 1 - Create Users:
Endpoint: `/user/create/<username>`

In [None]:
# Distinct values of the `name` column, in the order value_counts() returns them.
usernames = list(df['name'].value_counts().index)
print(usernames[:8])

In [None]:
def import_usernames(usernames):
    """Create one API user per name through ``/user/create/<username>``.

    Parameters
    ----------
    usernames : iterable of str
        Names to register. One GET request is sent per name.

    Returns
    -------
    requests.Response or None
        Only the response of the last request, or ``None`` when
        ``usernames`` is empty (this case used to raise UnboundLocalError).
    """
    from urllib.parse import quote  # local import keeps this cell self-contained

    res = None
    for user in usernames:
        # Percent-encode the name so characters such as '?' or '#' stay part
        # of the path segment instead of cutting the route short.
        route = f'/user/create/{quote(str(user))}'
        res = requests.get(flask_api + route)
    # Only the last response is returned.
    return res

last_response = import_usernames(usernames)
print(f'We have tried to create {len(usernames)} users.')
print("Last recorded event : ")    
display(HTML(last_response.text)) # Last user created    

### 2 - Create Public Chats
To do this, we will group the df by episode name, and use that variable as the `chat_title` to send to the API. Also, this group by allows us to see a list of the characters present on each episode.

Use this as an index to later create the group with the right members


In [None]:
chats_outline = df.groupby(['episode name', 'name']).agg({'line':'count'})
display(chats_outline.head(6))

Also, create a list of all the episodes we are going to upload

In [None]:
all_chat_titles = df['episode name'].drop_duplicates().values
all_chat_titles

#### Sending a request to the API for every available `chat_title` & `usernames` group

In [None]:
def create_chat(chat_title, participants):
    """Create a chat through the ``/chat/create`` endpoint.

    Parameters
    ----------
    chat_title : str
        Sent as the ``title`` query parameter.
    participants : list
        Sent as the ``users`` query parameter, in its string form
        (``str(participants)``), exactly as the previous f-string did.

    Returns
    -------
    requests.Response
        The response of the GET request.
    """
    # Let requests build the query string: characters such as '&', '#', '+'
    # or '%' inside a value are percent-encoded instead of corrupting the URL.
    params = {"title": str(chat_title), "users": str(participants)}
    return requests.get(flask_api + "/chat/create", params=params)

In [None]:
# Start from a known value so an empty `all_chat_titles` cannot leave `res`
# undefined (or silently reuse a response left over from an earlier cell).
res = None
for chat_title in all_chat_titles:
    # Second index level of `chats_outline` for this episode: the usernames
    # that have at least one line in it.
    participants = list(chats_outline.loc[chat_title].index)
    res = create_chat(chat_title, participants)

print("Last recorded event's http response: ")
if res is not None:
    display(HTML(res.text))

### 3 - Add New Users to an existing chat
To do this, I have created a new dummy user `rihp`, and added it to the `get_schwifty` chat.

In [None]:
def add_user_to_chat(username, chat_title):
    """Add a user to a chat through ``/chat/<chat_title>/adduser``.

    Parameters
    ----------
    username : str
        Sent as the ``username`` query parameter.
    chat_title : str
        Title of the chat, used as part of the route.

    Returns
    -------
    requests.Response
        The response of the GET request.
    """
    route = f"/chat/{chat_title}/adduser"
    # Passing the value through `params` drops the stray '?&' of the
    # hand-built URL and percent-encodes the username.
    return requests.get(flask_api + route, params={"username": str(username)})

In [None]:
# NOTE: `username` is a one-element list because import_usernames iterates
# over its argument; it is unpacked with `*` for add_user_to_chat below.
username=['rihp']
chat_title = "get_schwifty"

# Create the dummy user first ...
res = import_usernames(username)
display(HTML(res.text))
print('------------------------------------------------------')
# ... then add it to the chat.
res = add_user_to_chat(*username, chat_title)
display(HTML(res.text))

### 4 - Add Messages to a public chat
Using our dataset, we will use the following columns to populate our collection of `messages`

#### To send a message, the user must be part of the chat
This function is buggy and prints a lot of verbose output.
Please fix it.

`check_user_in_chat('rihp', 'pickle_rick')`

In [None]:
m_outline = df[['episode name', 'name', 'line']]
m_outline.head(3)

### The execution of these cells is not very efficient
they send all the messages to the database

In [None]:
def send_message(username, chat_title, text):
    """Send one message to a chat through ``/chat/<chat_title>/addmessage``.

    Parameters
    ----------
    username : str
        Sent as the ``username`` query parameter.
    chat_title : str
        Title of the chat, used as part of the route.
    text : str
        Message body, sent as the ``text`` query parameter.

    Returns
    -------
    requests.Response
        The response of the GET request.
    """
    route = f"/chat/{chat_title}/addmessage"
    # Dialogue lines can contain '&', '#', '+' or '%'. Pasted raw into the
    # query string they truncate or corrupt the message, so let requests
    # percent-encode every value.
    params = {"username": str(username), "text": str(text)}
    return requests.get(flask_api + route, params=params)

In [None]:
# Start from a known value so an empty `m_outline` cannot leave `res`
# undefined or stale.
res = None
# Walk the three columns in lockstep. The previous version fed index labels
# to the positional `.iloc`, which is only correct for a default RangeIndex,
# and built a whole Series for every row.
for chat_title, username, text in zip(
    m_outline['episode name'],   # episode name, used as chat title
    m_outline['name'],           # username
    m_outline['line'],           # message text
):
    res = send_message(username, chat_title, text)  # <----SEND MESSAGE TO API------
print("Last recorded event (The http response): ")
if res is not None:
    display(HTML(res.text))

### 5 - Query all the messages sent to a specific chat
- (GET) `/chat/<chat_id>/list`

Aggregate the messages in the chat, using this pipeline query on our Mongo Database

In [None]:
CHATSquery = get_CHATSquery()

### Access the specific chat and display some of the messages
This is not the specific chat; it's only the first one in the list. Fix this.

In [None]:
# NOTE: this rebinds `all_chat_titles` -- from here on it is a list of
# (position, title) pairs built from the chat documents.
all_chat_titles = list(enumerate(doc['title'] for doc in CHATSquery))
all_chat_titles

### Access the first 5 lines of the `Pickle Rick` episode, index `8`

In [None]:
# Look the chat up by title instead of by position: index 8 only points at
# `pickle_rick` while the database returns the chats in that exact order.
pickle_rick_chat = next(
    (doc for doc in CHATSquery if doc['title'] == 'pickle_rick'), None
)
if pickle_rick_chat is None:
    print("Chat 'pickle_rick' not found in CHATSquery")
else:
    # Slicing never raises, even when the chat holds fewer than 5 messages.
    for message in pickle_rick_chat['messages'][:5]:
        print(message['text'])

### 6 - Perform a sentiment analysis on a specific chat's message history
- (GET) `/chat/<chat_id>/sentiment`

In this case, we will start by analyzing the Rick and Morty episode `Get Schwifty` (Season 2, Episode 5).

In [None]:
def get_chat_doc(chat_title, query):
    """Yield every chat document in ``query`` titled ``chat_title``.

    Parameters
    ----------
    chat_title : str
        Value compared against each document's ``'title'`` entry.
    query : iterable of dict
        Chat documents, e.g. the CHATSquery result already turned into a
        list. Any iterable works; it no longer needs ``len()`` or indexing.

    Yields
    ------
    dict
        Each matching document, in the order found.
    """
    for doc in query:
        if doc['title'] == chat_title:
            yield doc

#### Set the chat_title variable

In [None]:
episode = 'Get Schwifty'
# Chat titles are the normalised episode name: no_spaces() then lower-case
# (presumably "get_schwifty" here -- confirm against no_spaces).
chat_title = no_spaces(episode).lower()
# Messages of the first matching chat document; raises IndexError when no
# document has this title.
chat_messages = list(get_chat_doc(chat_title, CHATSquery))[0]['messages']

In [None]:
# Optional sanity check: the chat must hold exactly as many messages as the
# dataset has lines for this episode. Disabled by default.
check_original_data = False
if check_original_data:
    expected_count = df['episode name'].value_counts()[episode]
    if len(chat_messages) != expected_count:
        # The condition is "different", so report both counts instead of
        # claiming the database has *more* messages.
        raise Exception(
            f'message count mismatch for {episode!r}: the database holds '
            f'{len(chat_messages)} messages but the dataset has {expected_count}'
        )

#### Begin the sentiment analysis of that chat room

In [None]:
sia = SentimentIntensityAnalyzer()

In [None]:
def analyze_chat_sentiment(chat_messages):
    """Yield the polarity scores of every message in a chat.

    Parameters
    ----------
    chat_messages : iterable of dict
        Message documents; only their ``'text'`` entry is read.

    Yields
    ------
    The result of ``sia.polarity_scores(text)`` for each message, where
    ``sia`` is the notebook-level SentimentIntensityAnalyzer.
    """
    for message in chat_messages:
        yield sia.polarity_scores(message['text'])

In [None]:
print(f"Sentiment analysis for the chat: \n {chat_title}")
sentiment = pd.DataFrame(list(analyze_chat_sentiment(chat_messages)))
display(sentiment.describe())

## Accessing the API endpoint

In [None]:
def get_chat_sia(chat_title):
    """GET ``/chat/<chat_title>/sentiment`` from the API and return the response."""
    endpoint = flask_api + f"/chat/{chat_title}/sentiment"
    return requests.get(endpoint)

In [None]:
res = get_chat_sia('pickle_rick')

In [None]:
# Decode the JSON body once and reuse it for both reports.
sia_payload = res.json()
print(f" Number of lines avaliable to analyze: {len(sia_payload['compound'])}")
print(sia_payload.keys())

#### Get the mean compound for all the chats

In [None]:
def get_chats_sia_scores(all_chat_titles, query):
    """Compute the mean compound sentiment score of every chat.

    Parameters
    ----------
    all_chat_titles : iterable of (int, str)
        ``(position, chat_title)`` pairs; the position is ignored.
    query : list of dict
        Chat documents with a ``'title'`` and a ``'messages'`` entry.

    Returns
    -------
    pandas.DataFrame
        Indexed by chat title, with a single ``mean_compound_score`` column.
        Chats without any message are left out, since they have no mean
        (this case used to raise ValueError from ``describe()``).

    Raises
    ------
    IndexError
        If a title has no matching document in ``query``.
    """
    overall_scores = {}
    for _, chat_title in all_chat_titles:
        # Only the first matching document is needed.
        chat_doc = next(get_chat_doc(chat_title, query), None)
        if chat_doc is None:
            raise IndexError(f"no chat titled {chat_title!r} in query")

        scores = list(analyze_chat_sentiment(chat_doc['messages']))
        if not scores:
            continue

        # Same value as describe().loc['mean']['compound'], without
        # computing the other summary statistics.
        overall_scores[chat_title] = pd.DataFrame(scores)['compound'].mean()
    return pd.DataFrame.from_dict(
        overall_scores, orient='index', columns=['mean_compound_score']
    )

In [None]:
all_chats_sia_scores = get_chats_sia_scores(all_chat_titles, CHATSquery)
plt.figure(figsize=(15,5))
plt.ylabel('EPISODES')
plt.xlabel('SENTIMENT INTENSITY SCORE')
plt.title('RICK AND MORTY SENTIMENT INTENSITY MEAN COMPOUND SCORE PER EPISODE')
plt.barh(all_chats_sia_scores.index,all_chats_sia_scores.mean_compound_score )

### 7 - Query all the messages from a specific user, and perform SIA

####  Use a query, find a username and create an iterator that includes all the messages from that username

In [None]:
USERSquery = get_USERSquery()
query = USERSquery

In [None]:
def iter_messages_from_user(query, username):
    """Yield the text of every message sent by ``username``.

    Parameters
    ----------
    query : iterable of dict
        User documents with an ``'_id'`` and an ``'all_messages'`` entry.
    username : str
        Resolved to an id with ``get_user_id``.

    Yields
    ------
    The ``'text'`` entry of each message of every matching document.
    """
    # Resolve the id once: it is the same for every document, and
    # get_user_id is presumably a database lookup (TODO confirm).
    user_id = get_user_id(username)
    for user_doc in query:
        if user_doc['_id'] != user_id:
            continue
        for message in user_doc['all_messages']:
            yield message['text']

In [None]:
username = 'Morty'

In [None]:
all_user_messages = list(iter_messages_from_user(query, username))   # As an array of strings
all_user_messages[:2]

#### Take an array of strings and create an iterator of SIA scores

In [None]:
def iter_sia_scores(array_of_strings):
    """Yield the polarity scores of each raw string.

    Parameters
    ----------
    array_of_strings : iterable of str
        Texts to analyse. Any iterable works; it no longer needs ``len()``
        or indexing.

    Yields
    ------
    The result of ``sia.polarity_scores(text)`` for each text, where ``sia``
    is the notebook-level SentimentIntensityAnalyzer.
    """
    for text in array_of_strings:
        yield sia.polarity_scores(text)

In [None]:
user_scores = list(iter_sia_scores(all_user_messages))
user_scores[:2]

#### Pass a list of sia scores, and return a pandas dataframe

In [None]:
print(f"Describing Sentiment Intensity Analysis for this user:\n {username}")


def to_df(x):
    """Wrap ``x`` (e.g. a list of score dicts) in a pandas DataFrame."""
    return pd.DataFrame(x)


to_df(user_scores).describe()

In [None]:
def compare_user_sia(query):
    """Yield ``(username, mean compound score)`` for every user with messages.

    Parameters
    ----------
    query : list of dict
        User documents with ``'username'`` and ``'all_messages'`` entries.
        It is iterated again for each user, so it must be re-iterable
        (a list, not a one-shot iterator).

    Yields
    ------
    tuple
        The username and the mean of the ``'compound'`` scores of all the
        messages collected for that user.
    """
    for doc in query:
        username = doc['username']

        # Users who never sent a message have no scores to average.
        if not doc['all_messages']:
            continue

        # Collect the user's raw messages and score each of them.
        all_user_messages = list(iter_messages_from_user(query, username))
        user_scores = list(iter_sia_scores(all_user_messages))

        # The mean is computed once; it used to be evaluated twice, with the
        # first result left unused.
        yield username, pd.DataFrame(user_scores)['compound'].mean()
        
        
        
sia_vectors = list(compare_user_sia(query))

In [None]:
# NOTE: this rebinds `df`. Column 0 holds the usernames, column 1 the mean
# compound scores.
df = pd.DataFrame(sia_vectors).sort_values(by=1)

# One horizontal bar per user, sorted by score.
plt.figure(figsize=(11, 11))
plt.ylabel('USERNAME')
plt.xlabel('MEAN COMPOUND SENTIMENT INTENSITY ANALYSIS ')
plt.barh(df[0], df[1])

### 8 - Recommender system takes in a `user_id` and returns the top-3 similar `users`
- (GET) `/user/<user_id>/recommend`

### Prepare a dict: { 'rick':'all their messages joined in a single string'}


In [None]:
rick = 'rick'
morty = 'morty'
beth = 'beth'
jerry = 'jerry'

In [None]:
USERSquery = get_USERSquery()
usernames = [e['username'] for e in USERSquery]

In [None]:
print(usernames [:10])

### All messages as a single string
This produces an error with the `'` character, which shows up escaped with a backslash and could alter the meaning of the strings.

In [None]:
def raw_corpus(query, username):
    """Join every message sent by ``username`` into one space-separated string."""
    return " ".join(iter_messages_from_user(query, username))


# One corpus per username bound to `rick`, `morty`, `beth` and `jerry`.
docs = {name: raw_corpus(query, name) for name in (rick, morty, beth, jerry)}

In [None]:
# Rebuild `docs` for every known user: normalised username -> joined corpus.
docs = {}
for username in usernames:
    username = no_spaces(username).lower()
    docs[username] = raw_corpus(query, username)

In [None]:
count_vectorizer = CountVectorizer()
sparse_matrix = count_vectorizer.fit_transform(docs.values())
print(list(count_vectorizer.vocabulary_.keys())[:20])
m = sparse_matrix.todense()
print(m.shape)
print(m[0])

In [None]:
doc_term_matrix = sparse_matrix.todense()

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); try the new name first and fall back for old versions.
try:
    feature_names = count_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = count_vectorizer.get_feature_names()

# One row per user (keys of `docs`), one column per vocabulary term.
df = pd.DataFrame(doc_term_matrix,
                  columns=feature_names,
                  index=docs.keys())
display(df.tail())

#### Query specific words to explore the data for more human similarities

In [None]:
my_words = ["youtube", "dimension", "me", "my", "mine", "you", "your", "their", "them", "friend", "enemy", ]

# Selecting a word that is not a column of `df` raises KeyError, so keep only
# the words that are actually present.
known_words = [word for word in my_words if word in df.columns]
if known_words:
    display(df[known_words].sort_values(by=known_words, ascending=False).head())
else:
    print("None of the requested words appear in the vocabulary")

In [None]:
# Pairwise Euclidean distance between the rows of `df`.
# NOTE: despite its name, `similarity_matrix` holds DISTANCES, so smaller
# values mean more similar rows.
# NOTE(review): the `cosine_distances` name imported at the top of this
# notebook is an alias of sklearn's cosine_similarity (larger = more similar);
# swapping it in here would invert the meaning of the values.
#cosine_distances
similarity_matrix = euclidean_distances(df,df)
print(similarity_matrix)

In [None]:
sim_df = pd.DataFrame(similarity_matrix, columns=docs.keys(), index=docs.keys())
display(sim_df)

In [None]:
plt.figure(figsize=(15,15))
sns.heatmap(sim_df,annot=False)

#### The 3 users that are most similar to the user 'RICK'

In [None]:
similar_to_rick = sim_df['rick']
# Drop the user's own entry explicitly instead of assuming it sorts first:
# another user at distance 0 could otherwise push 'rick' into the top 3.
display(similar_to_rick.drop('rick').sort_values(ascending=True).iloc[:3])

In [4]:
 
recommender.most_similar_users('rick', top=3)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.72 µs


'{"morty":441.5155716393,"jerry":523.0793438858,"beth":530.7042490879}'

In [None]:
USERSquery =query
[e['username'] for e in USERSquery]

### 9 - Move local database to MongoAtlas cloud

### 10 - Prepare Docker Image

### 11  - Deploy Docker Image in Heroku

In [None]:
# Some basic checking:
Is there a chat 