# Welcome to the AIinAfrica Hackathon!
## Cyberbullying Contest
### Introduction

We present an analysis of the emotions expressed in tweets in order to detect instances of cyberbullying.
The tweets dataset was manually collected using the Twitter APIs by Margarita Bugueño, Fabián Fernandez and Francisco Mena.

The NRC Emotion Lexicon (aka EmoLex) is a list of English words and their associations with eight basic emotions (anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments (negative and positive). It was developed by [Saif Mohammad](https://saifmohammad.com/WebPages/NRC-Emotion-Lexicon.htm) and its emotion tags are based on Plutchik's wheel of emotions. The annotations were done manually by crowdsourcing.


## Exploratory Analysis
To begin this exploratory analysis, first import libraries and define functions for plotting the data using `matplotlib`. Depending on the data, not all plots will be made. (Hey, I'm just a simple kerneling bot, not a Kaggle Competitions Grandmaster!)

In [None]:
from mpl_toolkits.mplot3d import Axes3D
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt # plotting
import numpy as np # linear algebra
import os # accessing directory structure
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')


There are 44 csv files in the current version of the dataset:


In [None]:
# Show the contents of the data folder, then of the folder holding the tweet files.
for folder in ('data', 'data/tweets'):
    print(os.listdir(folder))

In [None]:
# Load a single tweet file to inspect its layout.
tweets = pd.read_csv('data/tweets/alyssalg93.csv', sep=',')
tweets.dataframeName = 'alyssalg93.csv'  # label kept as a plain attribute on the frame
n_rows, n_cols = tweets.shape
print(f'There are {n_rows} rows and {n_cols} columns')
tweets.head()

# Using the NRC Emotion Lexicon
We will be using the NRC Emotion Lexicon for the sentiment analysis of the tweets.
The NRC Emotion Lexicon is a list of English words and their associations with eight basic emotions 
(anger, fear, anticipation, trust, surprise, sadness, joy, and disgust) and two sentiments 
(negative and positive). The annotations were manually done by crowdsourcing.

In [None]:
from nltk import word_tokenize
from nltk.stem.snowball import SnowballStemmer

from tqdm import tqdm_notebook as tqdm

stemmer = SnowballStemmer("english")

# Word-level lexicon, read as tab-separated (word, emotion, association) rows.
lexicon = "NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-Wordlevel-v0.92.txt"
emolex_df = pd.read_csv(lexicon,
                        names=["word", "emotion", "association"],
                        sep='\t')

# Rows whose word was parsed as NaN can neither be stemmed nor used as keys.
emolex_df.dropna(subset=['word'], inplace=True)

# Reshape to one row per word and one column per emotion.
emolex_words = emolex_df.pivot(index='word',
                               columns='emotion',
                               values='association')

# Index the lexicon by lower-cased stem so it can be matched against stemmed
# tweet tokens. The fallback must be np.nan: a bare `nan` is not defined here.
emolex_words.index = emolex_words.index.map(
    lambda w: stemmer.stem(w.lower()) if w else np.nan)

# Column labels of the pivot, i.e. the emotion names, in column order.
emotions = emolex_words.columns.values

# One tuple of scores per word, ordered exactly like `emotions` so that the
# tuples line up with the score columns built from `emotions` elsewhere.
emolex_words['emotions'] = list(zip(*(emolex_words[emotion] for emotion in emotions)))

# Convert into a dictionary for faster lookup.
# Words sharing a stem share a key; the last such row is the one kept.
emolex_dict = emolex_words['emotions'].to_dict()

print("We built a dictionary of {} words associated to emotions".format(len(emolex_dict)))

In [None]:
# Run only once: downloads the NLTK 'punkt' resource
# (presumably the tokenizer data needed by word_tokenize -- confirm for your NLTK version).
import nltk
nltk.download('punkt')

In [None]:
# Functions to score the tweets
def text_emotion(df, column):
    '''
    Score every text in ``df[column]`` against the emotion lexicon.

    Each text is tokenized, every token is lower-cased and stemmed, and the
    lexicon scores of the stems found in ``emolex_dict`` are summed per text.

    INPUT:
        df: DataFrame containing ``column`` plus the columns 'id',
            'favorite count', 'retweet count' and 'created at', which are
            dropped from the result (KeyError if any is missing).
        column: name of the column holding the text to score.
    OUTPUT:
        New DataFrame (fresh 0..n-1 index) with the remaining input columns,
        a 'document' column holding the list of stemmed tokens of each text,
        and one score column per entry of ``emotions``.
    '''

    new_df = df.drop(['id', 'favorite count', 'retweet count', 'created at'], axis=1)
    # Positional index so the rows line up with scores_df in the final concat.
    new_df = new_df.reset_index(drop=True)

    documents = []
    scores = np.zeros((new_df.shape[0], len(emotions)))

    with tqdm(total=new_df.shape[0]) as pbar:
        for i, text in enumerate(new_df[column]):
            pbar.update(1)
            # A missing text is not a string (pandas reads it as NaN): score it as empty.
            tokens = word_tokenize(text) if isinstance(text, str) else []
            document = [stemmer.stem(token.lower()) for token in tokens]
            documents.append(document)
            for stem in document:
                # The lexicon is keyed by stem, so look up the stemmed token,
                # not the raw word.
                emo_score = emolex_dict.get(stem)
                if emo_score is not None:
                    scores[i, :] += emo_score

    # dtype=object keeps each token list as a single cell value.
    new_df['document'] = pd.Series(documents, index=new_df.index, dtype=object)
    scores_df = pd.DataFrame(data=scores, columns=emotions)

    return pd.concat([new_df, scores_df], axis=1)

Now we can read all the files and build a single dataframe with the EmoLex scores of all the tweets in the directory.

In [None]:
# Read every set of tweets and build the sentiment dataframes
scored_frames = []
for file in os.listdir('data/tweets'):
    # Only tweet exports are scored; any other directory entry is skipped.
    if not file.endswith('.csv'):
        continue
    screen_name = os.path.splitext(file)[0]
    df = pd.read_csv(os.path.join('data/tweets', file), delimiter=',')
    df['screen_name'] = screen_name
    print("Scoring tweets from ", screen_name)
    df_emo = text_emotion(df, 'text')
    scored_frames.append(df_emo)

# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# all previously accumulated rows on every iteration.
df_emo_all = pd.concat(scored_frames) if scored_frames else pd.DataFrame()

df_emo_all.shape

In [None]:
# Display the first rows of the combined, scored tweets.
df_emo_all.head()

In [None]:
# Aggregate per user: mean score of each emotion, plus the number of tweets scored.
emotion_columns = ['anger', 'anticipation', 'disgust', 'fear', 'joy', 'sadness',
                   'surprise', 'trust', 'negative', 'positive']
# The columns are selected with a list: indexing a groupby with a bare tuple of
# names is rejected by recent pandas releases.
df_emotions = df_emo_all.groupby('screen_name')[emotion_columns].mean()
# value_counts() is indexed by screen_name, so it aligns with the grouped index.
df_emotions['n_tweets'] = df_emo_all.screen_name.value_counts()
df_emotions.head()

## Exploratory Analysis

In [None]:
# Distribution graphs (histogram/bar graph) of column data
def plotPerColumnDistribution(df, nGraphShown, nGraphPerRow):
    """Plot one distribution graph per eligible column of ``df``.

    Only columns with more than 1 and fewer than 50 unique values are plotted.
    A column whose first value is numeric is drawn as a histogram, any other
    column as a bar chart of its value counts.

    Parameters:
        df: DataFrame to plot.
        nGraphShown: maximum number of columns to plot.
        nGraphPerRow: number of graphs per row of the subplot grid.

    Returns None; prints a message instead of plotting when no column is eligible.
    """
    nunique = df.nunique()
    df = df[[col for col in df if nunique[col] > 1 and nunique[col] < 50]] # For displaying purposes, pick columns that have between 1 and 50 unique values
    _, nCol = df.shape
    if nCol == 0:
        # Without this guard the figure would be created with zero height.
        print('No distribution plots shown: no column has between 2 and 49 unique values')
        return
    columnNames = list(df)
    # Ceiling division. It must stay an integer: plt.subplot rejects a float row count.
    nGraphRow = (nCol + nGraphPerRow - 1) // nGraphPerRow
    plt.figure(num = None, figsize = (6 * nGraphPerRow, 8 * nGraphRow), dpi = 80, facecolor = 'w', edgecolor = 'k')
    for i in range(min(nCol, nGraphShown)):
        plt.subplot(nGraphRow, nGraphPerRow, i + 1)
        columnDf = df.iloc[:, i]
        if (not np.issubdtype(type(columnDf.iloc[0]), np.number)):
            # Non-numeric column: bar chart of the frequency of each value.
            valueCounts = columnDf.value_counts()
            valueCounts.plot.bar()
        else:
            columnDf.hist()
        plt.ylabel('counts')
        plt.xticks(rotation = 90)
        plt.title(f'{columnNames[i]} (column {i})')
    plt.tight_layout(pad = 1.0, w_pad = 1.0, h_pad = 1.0)
    plt.show()


In [None]:
# Correlation matrix
def plotCorrelationMatrix(df, graphWidth):
    """Plot the correlation matrix of the numeric columns of ``df``.

    Non-numeric columns, columns containing NaN and constant columns are
    discarded first. If fewer than two columns remain, a message is printed
    and nothing is plotted.

    Parameters:
        df: DataFrame to analyse.
        graphWidth: width and height passed to the (square) figure.

    Returns None.
    """
    df = df.select_dtypes(include=[np.number]) # correlations are only defined for numeric columns
    df = df.dropna(axis='columns') # drop columns with NaN (axis must be passed by keyword)
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    if df.shape[1] < 2:
        print(f'No correlation plots shown: The number of non-NaN or constant columns ({df.shape[1]}) is less than 2')
        return
    corr = df.corr()
    plt.figure(num=None, figsize=(graphWidth, graphWidth), dpi=80, facecolor='w', edgecolor='k')
    # fignum=1 draws into the figure created just above instead of opening a new one.
    corrMat = plt.matshow(corr, fignum = 1)
    plt.xticks(range(len(corr.columns)), corr.columns, rotation=90)
    plt.yticks(range(len(corr.columns)), corr.columns)
    plt.gca().xaxis.tick_bottom()
    plt.colorbar(corrMat)
    plt.title('Correlation Matrix', fontsize=15)
    plt.show()


In [None]:
# Scatter and density plots
def plotScatterMatrix(df, plotSize, textSize):
    """Plot a scatter matrix (with KDE diagonals) of the numeric columns of ``df``.

    Columns containing NaN and constant columns are discarded, and at most
    the first 10 remaining columns are plotted. Each panel above the diagonal
    is annotated with the correlation coefficient of its pair of columns.

    Parameters:
        df: DataFrame to plot.
        plotSize: width and height passed to the (square) figure.
        textSize: font size of the correlation annotations.

    Returns None; prints a message instead of plotting when no column is left.
    """
    df = df.select_dtypes(include =[np.number]) # keep only numerical columns
    # Remove rows and columns that would lead to df being singular
    df = df.dropna(axis='columns') # axis must be passed by keyword
    df = df[[col for col in df if df[col].nunique() > 1]] # keep columns where there are more than 1 unique values
    columnNames = list(df)
    if not columnNames:
        print('No scatter plots shown: no numeric, NaN-free, non-constant column is available')
        return
    if len(columnNames) > 10: # reduce the number of columns for matrix inversion of kernel density plots
        columnNames = columnNames[:10]
    df = df[columnNames]
    ax = pd.plotting.scatter_matrix(df, alpha=0.75, figsize=[plotSize, plotSize], diagonal='kde')
    corrs = df.corr().values
    # k=1 walks the panels strictly above the diagonal.
    for i, j in zip(*np.triu_indices_from(ax, k = 1)):
        ax[i, j].annotate('Corr. coef = %.3f' % corrs[i, j], (0.8, 0.2), xycoords='axes fraction', ha='center', va='center', size=textSize)
    plt.suptitle('Scatter and Density Plot')
    plt.show()


Now you're ready to read in the data and use the plotting functions to visualize the data.

Distribution graphs (histogram/bar graph) of sampled columns:

In [None]:
# Distribution of the per-user emotion means (the tweet count is left out).
plotPerColumnDistribution(df_emotions.drop(columns=['n_tweets']), 4, 4)

Correlation matrix:

In [None]:
# Correlation between the per-user emotion means (the tweet count is left out).
plotCorrelationMatrix(df_emotions.drop(columns=['n_tweets']), 6)

Scatter and density plots:

In [None]:
# Pairwise scatter plots of the per-user emotion means (the tweet count is left out).
plotScatterMatrix(df_emotions.drop(columns=['n_tweets']), 18, 10)

In [None]:
# Display the first rows of the per-user aggregates.
df_emotions.head()

In [None]:
# How many bullies do we have?
# Users whose mean anger score per tweet is above 0.3.
df_emotions.loc[df_emotions['anger'] > 0.3]

In [None]:
# ...and who is potentially being bullied?
# Users whose mean fear score per tweet is above 0.3.
df_emotions.loc[df_emotions['fear'] > 0.3]

In [None]:
# Top 10 angriest users (highest mean anger first)
df_emotions.sort_values(by='anger', ascending=False).head(10)

In [None]:
# Top 10 saddest users (highest mean sadness first)
df_emotions.sort_values(by='sadness', ascending=False).head(10)

## Conclusion
So where do we go from here? Can you build a predictive model based on tweets?

In [None]:
# Let's play with isiZulu

def build_emolex(language):
    """Build an emotion lookup dictionary from a per-language lexicon file.

    Reads the tab-separated file
    ``NRC-Emotion-Lexicon-v0.92_<language>.txt``, using its first column as
    the word index and every other column as an emotion score.
    Rows without a translation (index missing or equal to 'NO TRANSLATION')
    are discarded so that they do not end up as a key of the dictionary.

    Parameters:
        language: suffix of the lexicon file name, e.g. 'isizulu'.

    Returns:
        (emolex_dict, emotions): a dict mapping each word to the tuple of its
        scores, and the array of emotion column names in the same order as
        the tuples.

    Prints the shape of the filtered lexicon and the size of the dictionary.
    """
    lexicon = "NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92_"+language+".txt"
    emolex_words = pd.read_csv(lexicon,
                               index_col=0,
                               sep='\t', na_values='NO TRANSLATION')
    # Filter on both forms of the marker: missing and literal.
    translated = emolex_words.index.notna() & (emolex_words.index != 'NO TRANSLATION')
    # .copy() so the column added below is written to an independent frame.
    emolex_words = emolex_words[translated].copy()
    print(emolex_words.shape)

    emotions = emolex_words.columns.values
    # One tuple of scores per word, ordered like `emotions`.
    emolex_words['emotions'] = list(zip(*(emolex_words[name] for name in emotions)))
    # Convert into a dictionary for faster lookup
    emolex_dict = emolex_words['emotions'].to_dict()
    print("We built a dictionary of {} words associated to {} emotions".format(len(emolex_dict), len(emotions)))
    return (emolex_dict, emotions)

In [None]:
# Build the isiZulu lexicon: a (word -> scores, emotion names) pair.
language = 'isizulu'
isizulu = build_emolex(language)

In [None]:
# Number of emotion columns found in the isiZulu lexicon file.
len(isizulu[1])

In [4]:
# define a function to score a sentence
from nltk import word_tokenize
def sentence_emotions(sentence, emo_dict):
    """
    Scores each word in a sentence and prints total emotional scores

    Parameters:
        sentence: text to score; it is tokenized with word_tokenize and each
            token is looked up exactly as written (no lower-casing or stemming).
        emo_dict: (lexicon, emotions) pair as returned by build_emolex, where
            lexicon maps a word to its sequence of scores and emotions holds
            the matching emotion names.

    Returns None; the totals are printed, one line per emotion.
    """
    emolex, emotions = emo_dict
    # Sized from the lexicon: the number of emotion columns depends on the
    # lexicon file and is not necessarily 10.
    sentence_score = np.zeros(len(emotions))
    for word in word_tokenize(sentence):
        emo_score = emolex.get(word)
        if emo_score is not None:
            sentence_score += emo_score
    print("Sentence scores:")
    for emotion, score in zip(emotions, sentence_score):
        print("{}: {}".format(emotion, score))
    return

In [None]:
# Score a two-word sample sentence with the isiZulu lexicon.
sentence_emotions("bulala ukudabuka", isizulu)

In [None]:
# Rebuild the isiZulu lexicon: a (word -> scores, emotion names) pair.
language = 'isizulu'
isizulu = build_emolex(language)

In [None]:
# Build the Xhosa lexicon. It is stored under its own name so that the Xhosa
# dictionary is not mistaken for the Sesotho one.
language = 'xhosa'
xhosa, emotions = build_emolex(language)

In [None]:
# Build the Sesotho lexicon; `emotions` is rebound to this lexicon's emotion names.
language='sesotho'
sesotho, emotions = build_emolex(language)

In [None]:
# Look up a multi-word entry; .get returns None when the phrase is not a key.
sesotho.get('ho etsa lichelete')

In [None]:
# Same construction as build_emolex, written out step by step for Sesotho.
# Here the untranslated rows are removed by filtering the index explicitly.
language = 'sesotho'
lexicon = "NRC-Sentiment-Emotion-Lexicons/NRC-Emotion-Lexicon-v0.92/NRC-Emotion-Lexicon-v0.92_"+language+".txt"
emolex_words = pd.read_csv(lexicon, sep='\t', index_col=0)

has_translation = emolex_words.index != 'NO TRANSLATION'
emolex_words = emolex_words[has_translation]

print(emolex_words.shape)

# Emotion names are the column labels; each word gets the tuple of its scores
# in that same column order.
emotions = emolex_words.columns.values
emolex_words['emotions'] = list(zip(*(emolex_words[name] for name in emolex_words)))
emolex_dict = emolex_words['emotions'].to_dict()
print("We built a dictionary of {} words associated to {} emotions".format(len(emolex_dict), len(emotions)))


In [None]:
# Distinct index labels of the lexicon frame, and distinct keys of the lookup
# dictionary built from it.
emo_set = set(emolex_words.index)
dict_set = set(emolex_words['emotions'].to_dict())


## Examples of 

In [None]:
# We could develop a metric to define a "bully" with some thresholds

def is_a_bully(screen_name, df):
    """
    Exercise stub: flag a user as a potential bully from the aggregated metrics.

    Parameters:
        screen_name: user to evaluate.
        df: DataFrame of aggregated metrics.

    Returns:
        Always False until the exercise is implemented; neither argument is
        read yet.
    """
    # TODO: your code here
    return False

In [None]:
# Can you write a scoring function that uses bi-grams (a sequence of two words) and tri-grams (a sequence of three words)
# instead of scoring word by word?
# HINT: Google search for "generate bigrams nltk"

def bigram_sentence_emotions(sentence, emo_dict):
    """
    Scores single words and bigrams (sequences of two words) in a sentence
    and prints the total emotional scores.

    Exercise stub: the scoring itself is still to be written, so every total
    is currently printed as 0.0.

    Parameters:
        sentence: text to score (not read yet).
        emo_dict: (lexicon, emotions) pair; only the emotion names are used
            so far.

    Returns None.
    """
    emolex, emotions = emo_dict
    # Sized from the lexicon: with a fixed length of 10 the print loop below
    # would index past the end for a lexicon with more than 10 emotions.
    sentence_score = np.zeros(len(emotions))

    # Your code here


    print("Sentence scores:")
    for emotion, score in zip(emotions, sentence_score):
        print("{}: {}".format(emotion, score))
    return

In [None]:
# Build a dictionary that works for "Joburg Zulu" or Scamto urban language by mixing english and zulu words
# and adding slang


