# Perception of Minority Groups in Switzerland 
## Analysis Part 2: Keyword Analysis
In the 'Comment Analysis' section we extracted all the keywords from the full set of comments. Now we compute a frequency count of those keywords in order to see which topics are being discussed within each category.

In [7]:
import json
import time

from collections import Counter
from csv import reader

def keyword_counter(input_file, top_n=20):
    """Count how often each keyword occurs in a CSV file of keywords.

    Every cell of every row is treated as one keyword and is counted exactly
    as it appears in the file: whitespace and quote characters are not
    stripped, and empty cells are counted too (as ``''``).

    Args:
        input_file: Path of the CSV file to read.
        top_n: How many of the most frequent keywords to report.
            Defaults to 20.

    Returns:
        A list of ``(keyword, count)`` tuples for the ``top_n`` most frequent
        keywords, most frequent first. The same list is also printed.
    """
    # The csv module asks for files to be opened with newline='' so that
    # line endings inside quoted fields reach the reader untouched.
    with open(input_file, newline='') as read_obj:
        my_counter = Counter(
            word for row in reader(read_obj) for word in row
        )

    most_occur = my_counter.most_common(top_n)
    print(most_occur)
    return most_occur

In [8]:
# Most common keywords in the extracted-keyword CSV for the Jewish category;
# keyword_counter prints the list and the result is kept for the bar graphs.
jewish_keyword_count = keyword_counter('keyword_jewish_extracted.csv')

[('', 242), (" 'switzerland'", 33), (" 'people'", 32), (" 'jews'", 20), (" 'know'", 19), (" 'jewish'", 19), (" 'think'", 18), (" 'religion'", 18), (" 'anti semitism'", 15), (" 'israel'", 13), (" 'study'", 13), (" 'lot'", 13), (" 'well'", 13), (" 'antisemitism'", 12), (" 'one'", 11), (" 'see'", 11), (" 'way'", 10), (" 'experience'", 10), (" 'country'", 9), (" 'say'", 9)]


In [9]:
# Most common keywords in the extracted-keyword CSV for the Muslim category;
# keyword_counter prints the list and the result is kept for the bar graphs.
muslim_keyword_count = keyword_counter('keyword_muslim_extracted.csv')

[('', 831), (" 'switzerland'", 83), (" 'people'", 77), (" 'women'", 73), (" 'religion'", 70), (" 'want'", 55), (" 'wear'", 52), (" 'think'", 51), (" 'muslims'", 44), (" 'know'", 43), (" 'would'", 41), (" 'vote'", 40), (" 'islam'", 36), (" 'live'", 33), (" 'law'", 33), (" 'one'", 31), (" 'initiative'", 31), (" 'svp'", 30), (" 'face'", 30), (" 'see'", 29)]


In [10]:
# Most common keywords in the extracted-keyword CSV for the Black people
# category; keyword_counter prints the list and the result is kept for the
# bar graphs.
black_people_keyword_count = keyword_counter('keyword_black_extracted.csv')

[('', 2326), (" 'people'", 302), (" 'switzerland'", 286), (" 'think'", 213), (" 'racism'", 202), (" 'racist'", 171), (" 'want'", 156), (" 'know'", 153), (" 'lot'", 148), (" 'swiss'", 119), (" 'one'", 112), (" 'country'", 111), (" 'see'", 110), (" 'get'", 107), (" 'way'", 107), (" 'like'", 106), (" 'make'", 101), (" 'would'", 99), (" 'name'", 91), (" 'time'", 91)]


## Creating Bar Graphs

In [None]:
import matplotlib.pyplot as plt

# Draw one bar chart per category from the (keyword, count) lists computed
# in the cells above.
topics = [
    ("Jewish", jewish_keyword_count),
    ("Muslim", muslim_keyword_count),
    ("Black people", black_people_keyword_count),
]

for category, topic in topics:
    # Start from empty lists for every category; sharing one pair of lists
    # across the loop would carry the previous categories' bars into each
    # later chart.
    word_list = []
    word_freq = []

    for raw_word, count in topic:
        # Keywords are stored like " 'anti semitism'": trim the surrounding
        # whitespace and quotes but keep inner spaces, so 'anti semitism'
        # and 'antisemitism' stay separate bars instead of sharing a label.
        label = raw_word.strip().strip("'\"").strip()
        if not label:
            # The empty keyword comes from empty CSV cells, not from a real
            # word, so it is left out wherever it appears in the list.
            continue
        word_list.append(label)
        word_freq.append(count)

    # A fresh figure per category keeps the charts from being drawn on top
    # of each other.
    plt.figure(figsize=(15, 8))
    plt.title(label="Top 20 Keyword Count ({})".format(category))
    plt.ylabel("Word Count")
    plt.xlabel("Keyword")
    plt.bar(word_list, word_freq)
    plt.xticks(
        rotation=45,
        fontweight='light',
    )
    plt.tight_layout()

## Phrase Scoring with RAKE
Another way to see what is being discussed is to use the RAKE library to extract the top-scoring phrases. We attempted this, but RAKE accepts only one stopword dictionary, and some of the comments contain German or Swiss German, so automatically cleaning out these comments was not straightforward. The results here are therefore not very concise; however, they can still give an idea of the topics being discussed in the forum for each category.

In [64]:
import json
import time
import RAKE
from csv import writer, reader
from googleapiclient import discovery

# Path of the stopword list file handed to RAKE.
stop_dir = 'SmartStoplist.txt'
# Module-level RAKE instance built from that stopword list; keyword_analysis
# below calls rake_obj.run() on the collected comment text.
rake_obj = RAKE.Rake(stop_dir)

def keyword_analysis(input_file):
    """Run RAKE over the comments of a CSV file and return the top 50 results.

    The first row of the file is treated as a header and skipped. The text in
    column index 4 of every remaining row is joined into one string, which is
    passed to the module-level ``rake_obj``.

    Args:
        input_file: Path of the CSV file to read.

    Returns:
        The last 50 entries of the RAKE result after ``sort_tuple`` has
        ordered it ascending by element 1 (presumably the phrase score --
        confirm against the RAKE library), i.e. the 50 entries with the
        largest values. The same list is also printed.
    """
    with open(input_file, newline='') as read_obj:
        csv_reader = reader(read_obj)

        # Skip header; the default keeps an empty file from raising
        # StopIteration.
        next(csv_reader, None)

        # Rows with fewer than five columns (for example blank lines, which
        # the csv reader yields as []) carry no comment and are skipped.
        comments = [row[4] for row in csv_reader if len(row) > 4]

    # Text layout handed to RAKE: a single leading space, then every comment
    # preceded by '. ' so that comments are separated as sentences.
    all_comments = ' ' + ''.join('. ' + comment for comment in comments)

    keywords = sort_tuple(rake_obj.run(all_comments))[-50:]
    print(keywords)
    return keywords

def sort_tuple(tuple):
    """Sort a list of sequences in place by their second element.

    Args:
        tuple: A list whose items can be indexed at position 1. Despite the
            parameter name this must be a list, because ``.sort`` is called
            on it.

    Returns:
        The same list object, now ordered ascending by element 1. Items with
        equal keys keep their relative order.
    """
    def _second(entry):
        return entry[1]

    tuple.sort(key=_second)
    return tuple