# Word frequency analyze 

Some words like "dogs, gay and rubbish" is more frequent in low score comments. Although some comments are long and well strucured, but due to the insulting words, the score becomes low. So this code is analyzing the word frequency to see if the word frequency is different in different comments. 

## load data

In [1]:
import pandas as pd 
import numpy as np

In [2]:
path = '/Users/apple/Documents/GitHub/Argument-Scoring-System/comment_data/comments.csv'
data = pd.read_csv(path)
text = data['comment_text']
score = data['mean_evaluation']

In [3]:
# print (text)

## text cleaning

In [4]:
import re
import itertools
from collections import Counter

In [5]:
"""
Original taken from https://github.com/dennybritz/cnn-text-classification-tf
"""


def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [6]:
def clear_split_str(s):
    strip_s = s.strip()
    clear_s = clean_str(strip_s)
    s_text = clear_s.split(" ")
    return s_text

## clear text

In [7]:
# tokenize words
clean_text = [clear_split_str(sent) for sent in text]

In [8]:
print (clean_text[0])

['negative', 'gearing', 'may', 'have', 'seemed', 'like', 'a', 'good', 'idea', 'when', 'it', 'was', 'first', 'introduced', ',', 'but', 'it', 'has', 'become', 'little', 'more', 'than', 'a', 'massive', 'rort', 'the', 'only', 'reason', 'we', 'still', 'have', 'it', 'after', 'its', 'problems', 'have', 'been', 'pointed', 'out', 'so', 'often', 'is', 'that', 'politicians', 'are', 'under', 'the', 'thumb', 'of', 'the', '20', 'who', 'do', 'most', 'of', 'the', 'negative', 'gearing', 'and', 'have', 'the', 'power', 'to', 'scream', 'the', 'loudest', 'come', 'election', 'time', 'it', 'has', 'no', 'place', 'in', 'a', 'just', 'and', 'efficient', 'economic', 'system']


## dive data into four parts
* high score, low word count 
* low score, low word count
* high score, high word count
* high score, high word count

In [9]:
from collections import defaultdict

In [10]:
# seperate based on word count and score
# differences between different score
score_dic  = defaultdict(list)
for i in range(len(text)):
    score_dic[score[i]] = score_dic.get(score[i], []) + clean_text[i]

In [11]:
# set of words dictionary
score_word_dic = {}
for key in score_dic.keys():
    score_word_dic[key] = set(score_dic[key])

In [12]:
print (score_word_dic.keys())

dict_keys([0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, nan, nan, 2.3333333333333304, nan, nan, 3.6666666666666696, nan, nan, 4.5, nan, nan, nan, nan, nan, nan, 6.3333333333333304, nan, nan, nan, nan, nan, nan, nan, nan, nan, 5.6666666666666696, nan, 6.6666666666666696, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 1.3333333333333299, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 3.3333333333333304, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, 4.3333333333333304, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan,

In [13]:
# distinctive words for different score
dist_score_word_dic = {}
keys = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}
for key in keys:
    key_temp = keys - {key}
    value_temp = score_dic[key]
    dist_score_word_dic[key] = value_temp
    for v in key_temp:
        dist_score_word_dic[key] = list(set(dist_score_word_dic[key]) - set(score_dic[v]))
    

In [15]:
# print (dist_score_word_dic[5])

In [72]:
print (keys - {0.0})

{1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}


* There is no obvious features in words for different score.

## most common words in different score

In [16]:
from nltk import FreqDist

In [30]:
def freq_words(words, number):
    '''
    return the words which the frequency is higher than the given number
    number: the lowest value of frequency
    words: the cleared words list
    '''
    word_freq = FreqDist(words).most_common(1000)
    words_list = [key for (key, value) in word_freq if value > number]
    return words_list

In [39]:
fre_dict = {}
for key in score_dic.keys():
    fre_dict[key] = freq_words(score_dic[key], 5)

In [40]:
keys = {0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0}
distinc_freq_dict = {}
for key in keys:
    key_temp = keys - {key}
    value_temp = fre_dict[key]
    distinc_freq_dict[key] = value_temp
    for v in key_temp:
        distinc_freq_dict[key] = list(set(distinc_freq_dict[key]) - set(fre_dict[v]))

In [41]:
# show the dinstinctive words in for different score
for k in distinc_freq_dict:
    print (k)
    print (distinc_freq_dict[k])
    print ("------")

0.0
['nukes']
------
1.0
['abbot', 'father', 'voters', 'mothers', 'stupid', 'protected', 'says', 'stand', 'cant']
------
2.0
['liberals', 'trusted', 'student', 'biological']
------
3.0
['reporting', 'maternity', 'mixed', 'secure', 'line', 'tradition', 'mr', 'alternatives', 'speech', 'press', 'firstly', 'plans', 'days', 'gone', 'attention', 'leader', 'acceptance', 'solutions', 'federal', 'wear', 'campaign', 'learn', 'prime', 'minister', 'hours', 'hold', 'irrelevant', 'household', 'uses', 'mostly', 'earn', 'notion', 'divorce', 'throughout', 'performance', 'direction', 'defense', 'politician', 'tell', 'treated', '7', 'personally', 'main', 'yourself', 'highest']
------
4.0
['success', 'distribution', 'implement', 'exam', 'substances', 'discriminate', 'dole', 'offend', 'period', 'depends', 'pass', 'camps', 'medium', 'usage', 'competitive', 'daily', 'deaths', 'told', 'goal', 'stay', 'teach', 'according', 'childcare', 'houston', 'university', 'belief', 'leaving', 'unproductive', 'payment', 's

In [43]:
# words for different score
# for k in fre_dict:
#     print (k)
#     print (fre_dict[k])
#     print ("-----")

# Findings
* There is no findings for 