## 10. sentiment heatmap
Plots a heatmap of user sentiment after *n* interactions with the airline and runs a statistical test for each number of interactions.

**input**  : pickle file in the `obj/` folder  
**output** : plot of the sentiment heatmap, statistical test results

**Note that this Jupyter notebook contains an alternative solution for the case where too few conversations are found in the given zips.**

In [None]:
import numpy as np
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
from matplotlib.colors import LinearSegmentedColormap

In [None]:
# Lookup table: airline user id (integer) -> airline display name.
# The insertion order is kept as-is because later cells iterate over this dict.
airlines = {
    56377143: 'KLM',
    106062176: 'Air France',
    18332190: 'British Airways',
    22536055: 'American Airlines',
    124476322: 'Lufthansa',
    26223583: 'Air Berlin',
    2182373406: 'Air Berlin Assist',
    38676903: 'easyJet',
    1542862735: 'Ryanair',
    253340062: 'Singapore Airlines',
    218730857: 'Qantas',
    45621423: 'Etihad Airways',
    20626359: 'Virgin Atlantic',
}

In [None]:
# Extracts the conversations list from a pickle file in the obj folder.
# The file is opened in a `with` block so the handle is closed again right after loading.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle file that was
# produced by a trusted source.
with open("obj/conversations_with_scores.p", "rb") as conversations_file:
    conversations = pickle.load(conversations_file)

In [None]:
def CountAirline(airline, conversations):
    '''in: airline (integer user id), conversations (list of conversations, each an iterable of tweets)
      out: count of appearances (integer)

    Counts the tweets, over all conversations, whose field at index 1 equals the given airline id.
    A conversation in which the airline tweets several times is counted once per tweet.
    '''
    return sum(
        1
        for conversation in conversations
        for tweet in conversation
        if tweet[1] == airline
    )

In [None]:
# In this cell we determine how often American Airlines (22536055) appears in the conversations
# list and which airline appears most often.
# Every airline is counted exactly once; the counts are kept so AA's count can be reused
# instead of scanning all conversations a second time.
appearance_counts = {air_id: CountAirline(air_id, conversations) for air_id in airlines}

if 22536055 in appearance_counts:
    aa_count = appearance_counts[22536055]
else:
    aa_count = CountAirline(22536055, conversations)

# airline_max stays 0 (not a valid airline id) when none of the airlines appears at all.
# On a tie the airline that comes first in `airlines` wins, because only a strictly larger
# count replaces the current maximum.
max_count, airline_max = 0, 0
for air_id, count in appearance_counts.items():
    if count > max_count:
        max_count, airline_max = count, air_id

In [None]:
# In this cell we pick the airline that the rest of the notebook analyzes.
# American Airlines (22536055) is the preferred subject, but the visuals cannot be made
# if AA's count is 0, so in that case the most frequent airline is used instead. If AA is present
# but another airline appears more often, the user is asked which one to analyze.
if max_count == 0:
    # airline_max is still 0 here, which is not a key of `airlines`; fail with a clear message
    # instead of a KeyError from the lookup below.
    raise ValueError("None of the known airlines appear in the conversations, so there is nothing to analyze.")

elif aa_count == 0:
    print(("American Airlines does not appear in any of the conversations in the given Json files, so the code and "
           "visualizations below will not work. Therefore, we use another airline to do the visualizations. "
           "The airline that appears most in the given Json files is {}, with {} appearances."
           ).format(airlines[airline_max], max_count))
    airline_to_be_analyzed = airline_max

elif airline_max == 22536055:
    print("American Airlines has the most appearances in the conversations from the Json files, so it will "
          "be analyzed below.")
    airline_to_be_analyzed = 22536055

else:
    analyze_AA = input(("American Airlines has {} appearances. The airline with most appearances ({}) is {}. It might "
                        "be more interesting to analyze the second airline because it may have way more appearances. "
                        "Do you still want to analyze American Airlines? Type Y and hit enter for 'yes' or type N and hit "
                        "enter for 'no': "
                        ).format(aa_count, max_count, airlines[airline_max]))

    # Compare the whole (trimmed, lower-cased) answer against the accepted spellings. A substring
    # test against one string would also accept an empty answer, because '' is in every string.
    # 'ý' is kept as an accepted spelling of 'y', as in the original check.
    if analyze_AA.strip().lower() in ('y', 'yes', 'ý'):
        airline_to_be_analyzed = 22536055
    else:
        airline_to_be_analyzed = airline_max

In [None]:
def MakeHeatmap(conversations, airline, cutoff=6):
    '''in: a list of completed filtered conversations with sentiment scores, user_id of an airline (integer),
          cutoff (integer >= 1, optional): highest number of airline interactions that gets its own column
      out: three dataframes, of which the first is used for plotting the heatmap and the other two for statistics

    This function turns a list of completed filtered conversations with sentiment scores into three dataframes
    that all have the categories 1..5 as index and the interaction counts 0..cutoff as columns:
      1. score_dataframe (float64): the average sentiment score per cell, NaN where there is no data
      2. scores_with_weights (object): per cell a list [sum of scores, number of scores], or NaN
      3. second_dataframe_lists (object): per cell the list of individual scores, or NaN

    Each tweet is read by position: index 1 is compared with the airline id and index 4 is used as the
    sentiment score. Every conversation is reversed before it is processed (presumably conversations are
    stored newest tweet first - TODO confirm against the code that builds the pickle file).

    The category (row) is derived from the score of the first tweet after reversing:
    5 for a score below -0.6, 4 below -0.2, 3 below 0.2, 2 below 0.6 and 1 otherwise.
    Column 0 holds that first score, rounded to one decimal, once for every conversation in which the
    airline tweets. Column k holds the scores of the non-airline tweets that come after the k-th
    airline tweet, where k no longer increases once it reaches cutoff.

    The heatmap represents the influence airlines interacting with customers have on the customers' sentiment.

    Raises ValueError if cutoff is smaller than 1.
    '''
    if cutoff < 1:
        raise ValueError('cutoff must be at least 1')

    categories = np.array([1, 2, 3, 4, 5])
    responses = np.arange(0, cutoff + 1)
    category_bounds = (-0.6, -0.2, 0.2, 0.6)

    # (category, number of airline interactions) -> individual scores, in the order they were found
    scores_per_cell = {}

    for conversation in conversations:
        # An empty conversation has no first tweet to categorise, so there is nothing to record.
        if len(conversation) == 0:
            continue
        conversation = conversation[::-1]

        # Put the first tweet in one of the five categories. Every comparison is converted to int
        # before adding, because adding numpy booleans is a logical OR rather than a sum.
        first_score = conversation[0][4]
        category = 1 + sum(int(first_score < bound) for bound in category_bounds)
        begin_score = np.around(first_score, 1)

        # 0 means the airline has not tweeted yet in this conversation
        airline_count = 0

        for tweet in conversation:
            if tweet[1] == airline:
                if airline_count == 0:
                    # First airline tweet: register the conversation's starting sentiment
                    scores_per_cell.setdefault((category, 0), []).append(begin_score)
                    airline_count = 1
                elif airline_count < cutoff:
                    airline_count += 1
            elif airline_count > 0:
                # A tweet by someone else after at least one airline tweet
                scores_per_cell.setdefault((category, airline_count), []).append(tweet[4])

    # Fill plain numpy grids first and wrap them in dataframes at the end, so that no list
    # ever has to be assigned to a single dataframe cell.
    shape = (len(categories), len(responses))
    mean_values = np.full(shape, np.nan, dtype=np.float64)
    weight_cells = np.full(shape, np.nan, dtype=object)
    list_cells = np.full(shape, np.nan, dtype=object)

    for (category, response), scores in scores_per_cell.items():
        row = category - 1  # the categories 1..5 are stored in rows 0..4
        total = sum(scores)
        mean_values[row, response] = float(total / len(scores))
        weight_cells[row, response] = [total, len(scores)]
        list_cells[row, response] = scores

    score_dataframe = pd.DataFrame(mean_values, index=categories, columns=responses)
    scores_with_weights = pd.DataFrame(weight_cells, index=categories, columns=responses, dtype=object)
    second_dataframe_lists = pd.DataFrame(list_cells, index=categories, columns=responses, dtype=object)

    return score_dataframe, scores_with_weights, second_dataframe_lists

In [None]:
# In this cell we define the color gradient for the heatmap.
# Each anchor is (position on the 0-1 color scale, red, green, blue), with the channels on a 0-255 scale.
color_anchors = (
    (0.00, 254,   0,  0),
    (0.25, 254, 165,  0),
    (0.40, 254, 165,  0),
    (0.50, 254, 215,  0),
    (0.60, 154, 205, 50),
    (0.75, 154, 205, 50),
    (1.00,   0, 128,  1),
)

# LinearSegmentedColormap takes, per channel, rows of (position, value, value) with values between 0 and 1.
# Both values of a row are identical here, exactly as in the hand-written table this replaces.
cdict = {
    channel: tuple((anchor[0], anchor[index] / 255, anchor[index] / 255) for anchor in color_anchors)
    for index, channel in enumerate(('red', 'green', 'blue'), start=1)
}

test_cmap = LinearSegmentedColormap('Test', cdict)

# plt.register_cmap is deprecated in recent Matplotlib releases and no longer available in the newest ones,
# so use the colormap registry when this Matplotlib version has it. force=True lets this cell be run
# again without an "already registered" error.
try:
    from matplotlib import colormaps
except ImportError:
    plt.register_cmap(cmap=test_cmap)
else:
    colormaps.register(test_cmap, force=True)

In [None]:
def PlotHeatmap(score_dataframe, airline):
    '''in: a dataframe with one average sentiment score per cell, user_id of an airline (integer)

    Draws the dataframe as a heatmap with the registered 'Test' colormap and shows the figure.
    Cells without a value are masked, so they stay empty in the plot. The airline's name is
    looked up in the airlines dictionary and used in the title and the x-axis label.
    '''
    airline_name = airlines[airline]
    plt.figure(figsize=(12, 8))

    with sns.axes_style("white"):
        # Fixed color range and colorbar ticks
        colorbar_options = {"ticks": [-0.73, -0.45, -0.2, 0, 0.2, 0.45, 0.74]}
        missing_cells = score_dataframe.isnull()
        sns.heatmap(
            score_dataframe,
            cmap='Test',
            mask=missing_cells,
            cbar_kws=colorbar_options,
            vmin=-0.731065,
            vmax=0.740667,
        )

        plt.yticks(rotation=0)

        main_title = 'Tweet sentiment by airline and influence of {} on sentiment of customers'.format(airline_name)
        plt.suptitle(main_title, size=20)
        plt.title('Each cell represents the average final sentiment', size=15)

        x_label = 'Average sentiment after n number of interactions by {}'.format(airline_name)
        plt.xlabel(x_label, size=15)
        plt.ylabel('Sentiment score of the first tweet in the conversation', size=15)

    plt.show()

In [None]:
# Build the three dataframes for the chosen airline: the first one is plotted as the heatmap,
# the other two are used by the statistics cells further down.
score_dataframe, scores_with_weights, second_dataframe_lists = MakeHeatmap(conversations, airline_to_be_analyzed)

In [None]:
# Draw the heatmap of the average sentiment scores for the chosen airline.
PlotHeatmap(score_dataframe, airline_to_be_analyzed)

In [None]:
# This cell calculates the standard deviation (of the sentiment after the interaction) for
# every number of interactions. To do so, the score lists of all categories are merged per column.
# total_lists maps each column (number of interactions) to its merged list of scores.
# NOTE(review): ndarray.std() uses ddof=0 (population standard deviation) by default; confirm
# whether the sample standard deviation (ddof=1) is wanted for the t-tests.
standard_deviations = []
total_lists = {}

for column_number in second_dataframe_lists.columns:
    total_list = []

    # A cell is either a list of scores or NaN (no data); only the lists are merged.
    for partial_list in second_dataframe_lists[column_number]:
        if isinstance(partial_list, list):
            total_list.extend(partial_list)

    total_lists[column_number] = total_list

    # Without any score there is no standard deviation; store NaN directly rather than letting
    # numpy emit runtime warnings for an empty array (the stored value is NaN either way).
    if total_list:
        standard_deviations.append(np.array(total_list).std())
    else:
        standard_deviations.append(np.nan)

In [None]:
# This cell collects some statistics for every number of interactions an airline did in the following order:
# interactions, observations, average (of the sentiment after the interaction),
# standard deviation (of the sentiment after the interaction)
#
# statistics[i] always belongs to i interactions: when there is no data for i interactions, None is
# stored as a placeholder. Skipping the entry instead would shift every later entry to a lower index,
# so that a lookup by number of interactions would return the statistics of a different group.
statistics = []

for i in range(7):
    stats = None

    try:
        # A cell is either a list [sum of scores, number of scores] or NaN (no data)
        weighted_cells = [cell for cell in scores_with_weights.loc[:, i] if isinstance(cell, list)]
        deviation = standard_deviations[i]
    except (KeyError, IndexError):
        # There is no column or no standard deviation for this number of interactions
        weighted_cells = []

    observations = sum(cell[1] for cell in weighted_cells)
    if observations:
        score_total = sum(cell[0] for cell in weighted_cells)
        stats = [i, observations, score_total / observations, deviation]

    statistics.append(stats)

    if stats is None:
        print('No data')
    else:
        print('{:>2} interactions: {:>5} instances, average = {:>7.4f}, standard deviation = {:.3f}'.format(stats[0], stats[1],
                                                                                                            stats[2], stats[3]))

In [None]:
def ttest(x, y, data=None):
    '''in: x and y (integers), where x and y are the amount of interactions you want to compare (and x > y),
          data (optional): the statistics to look x and y up in; defaults to the global statistics list
      out: nothing is returned, the outcome of the t-test is printed

    Performs a one-sided t-test for two samples with unequal variances
    (H0: mu_x = mu_y against Ha: mu_x > mu_y) at a significance level of 0.05.
    data[x] and data[y] must be of the form [interactions, observations, average, standard deviation].

    'No or too little data' is printed instead of a test result when an entry is missing (or None),
    when a sample has fewer than two observations, or when the standard error is not a positive number.
    '''
    if data is None:
        data = statistics

    try:
        data_x = data[x]
        data_y = data[y]
    except (IndexError, KeyError):
        data_x = data_y = None

    # The degrees of freedom divide by (observations - 1), so each sample needs at least two observations.
    if data_x is None or data_y is None or data_x[1] < 2 or data_y[1] < 2:
        print('No or too little data')
        return

    # Squared standard error of each sample mean
    error_x = data_x[3]**2 / data_x[1]
    error_y = data_y[3]**2 / data_y[1]

    # Written as "not > 0" so that NaN is rejected as well as zero
    if not (error_x + error_y > 0):
        print('No or too little data')
        return

    t = (data_x[2] - data_y[2]) / (error_x + error_y)**0.5
    df = (error_x + error_y)**2 / (error_x**2 / (data_x[1] - 1) + error_y**2 / (data_y[1] - 1))

    # The survival function equals 1 - cdf, but keeps its precision for very small p-values
    p = st.t.sf(t, df, loc=0, scale=1)

    if p < 0.05:
        rejected = 'rejected'
    else:
        rejected = 'not rejected'
    print('H0: mu{0} = mu{1}\tHa: mu{0} > mu{1}\nt = {2:>6.3f}, p = {3:.4f}, so H0 is {4}\n'.format(data_x[0], data_y[0],
                                                                                                    t, p, rejected))

In [None]:
# Various t-tests; every pair is (larger number of interactions, smaller number of interactions)
for more_interactions, fewer_interactions in ((1, 0), (2, 0), (2, 1), (3, 0), (3, 2), (4, 2), (4, 3)):
    ttest(more_interactions, fewer_interactions)

In [None]:
# Final cell: prints a marker once execution has reached the end of the notebook.
print('Done')