# Audience Demographics

In [1]:
# Import libraries
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import requests

In [2]:
#Query url
url = "https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews"

In [3]:
# Scrap data from all review pages available for college
content = []
while url:
    print url
    output = requests.get(url)
    soup = BeautifulSoup(output.content, "html.parser")
    next_pag_url = soup.find('ul', class_='pager').find('li', class_='next')
    if next_pag_url:
        url = next_pag_url.a['href']
    else:
        break
    review_content = soup.findAll("div", { "class" : "content_data" })
    content.extend(review_content)

https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-2
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-3
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-4
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-5
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-6
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-7
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-8
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/reviews/page-9
https://collegedunia.com/university/25787-lovely-professional-university-lpu-jalandhar/re

### Total Positive/Negative votes for the college

In [4]:
# Total likes and dislikes corresponding to each  
total_positive = 0
total_negative = 0
for rc in content:
    like = rc.find("a", {"class" : "liked reviews_like_button review_like"}).span.string
    total_positive += int(like)
    dislike = rc.find("a", {"class" : "disliked reviews_like_button review_like"}).span.string
    total_negative += int(dislike)

print "Total Positive votes :" + str(total_positive)
print "Total Negative votes :" + str(total_negative)

Total Positive votes :423
Total Negative votes :212


### Percentage of "review authors" in the following categories : “College students”, “Alumni"

In [5]:
# Calculate percentage of current students and alumni based on keywords available in the result
college_students = 0
alumni = 0
total_reviews = len(content)
unknown = 0
for rc in content:
    author_batch = rc.find("span", {"class" : "author_batch"})
    if author_batch:
        author_batch = author_batch.string
        if "Pursued" in author_batch :
            college_students += 1
        elif "Batch" in author_batch :
            alumni += 1
        else:
            unknown += 1
    else :
        unknown +=1
    
print "Percentage of College students :" + str(float(college_students)/float(total_reviews)*100)
print "Percentage of Alumni :" + str(float(alumni)/float(total_reviews)*100)
    

Percentage of College students :50.0
Percentage of Alumni :48.5365853659


### Percentage of "review authors" who are Male/Female

In [6]:
def get_gender(name):
    """
    This function take name of person and returns gender based on api
    https://github.com/appeler/clarifai_gender

    Parameters
    -----------
    name : string
        name of person
    Returns
    --------
    string: gender
    """
    p = 0
    gender = "male"
    for i in range(len(name.split(" "))):
        gender_url = "https://api.genderize.io/?name=" + name.split(" ")[i]
        output = requests.get(gender_url).json()
        if output.get("probability", 0) > p:
            p = output.get("probability", 0)
            gender = output['gender']

    return gender

In [7]:
# We can use already available machine learning solutions to classify names based on learning from existing data of we have any
# 
female = 0
male = 0
total_reviews = len(content)
for rc in content:
    author_name = rc.find("span", {"class" : "author_name"}).string
    gender = get_gender(author_name)
    if gender == "male" :
        male += 1
    elif gender == "female" :
        female += 1
    else:
        raise IOError("Invalid name")
print "Percentage of Male :" + str(float(male)/float(total_reviews)*100)
print "Percentage of female :" + str(float(female)/float(total_reviews)*100)
    


Percentage of Male :87.3170731707
Percentage of female :12.6829268293


# Entity Extraction

In [8]:
english_stop_words =  ["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","knows","known","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]

In [9]:
import re
import string

from nltk import word_tokenize, pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer

In [10]:
def remove_stop(text):
    'Remove stop word'
    return [x for x in text if x not in english_stop_words + [""]]


def clean_words(text):
    'standardization string: Removing punctuation,all letters lower case'
    text = text.replace("\n", " ")
    text = text.replace("\t", " ")
    text = text.replace("\r", " ")
    delEStr = string.punctuation + string.digits + "–|[：+——！，。？、~@#￥%……&*（）]"
    table = {ord(char): None for char in delEStr}
    text = text.translate(table)  # Remove punctuation and numbers
    text = text.lower()
    text = text.split(' ')
    return text

### Frequency of meaningful entities

In [11]:
lemmatizer=WordNetLemmatizer()
words_list = []
for rc in content:
    review = rc.find("div", {"class" : "review_content"})
#     print review
# print review.prettify()
    if not review.find("h2"):
        pass
    else:
        if not review.find("p"):
            review_string = review.h2.string
        else :
            b = review.p.contents
            review_string = b[0] + review.h2.string
#     print review_string
    try:
        review_string = clean_words(review_string)
        review_string = remove_stop(review_string)
        for word in review_string:
            words_list.append(lemmatizer.lemmatize(word))
    except:
        continue


**Meaningful entities**

In [12]:
# count of meaningful words
from collections import Counter
counts = Counter(words_list)
print counts

Counter({u'college': 252, u'good': 191, u'university': 126, u'student': 120, u'management': 102, u'lpu': 93, u'experience': 63, u'life': 59, u'placement': 56, u'campus': 49, u'india': 46, u'bad': 41, u'study': 39, u'thing': 32, u'lovely': 31, u'time': 30, u'facility': 28, u'lot': 28, u'professional': 27, u'infrastructure': 27, u'education': 24, u'faculty': 23, u'place': 22, u'company': 21, u'class': 21, u'system': 19, u'remark': 18, u'day': 18, u'excellent': 17, u'collage': 17, u'awesome': 17, u'food': 17, u'great': 17, u'exam': 15, u'provide': 14, u'people': 14, u'learn': 14, u'complain': 14, u'part': 14, u'activity': 14, u'security': 13, u'environment': 13, u'year': 13, u'average': 13, u'package': 13, u'problem': 13, u'give': 12, u'till': 12, u'btech': 12, u'opportunity': 11, u'fee': 11, u'exposure': 11, u'enjoy': 10, u'work': 10, u'hostel': 10, u'make': 10, u'mark': 10, u'strict': 10, u'field': 10, u'hard': 10, u'admission': 10, u'private': 10, u'high': 10, u'knowledge': 10, u'lakh'

### Percentage of reviews which suggest, students did not get placed

In [13]:
def get_review_based_on_index_distance(word_1, word_2, review):
    get_index_word_1 = review.index(word_1)
    get_index_word_2 = review.index(word_2)
    distance = abs(get_index_word_1 - get_index_word_2)
    if distance < 5:
        return 1
    else:
        return 0
        

In [14]:
lemmatizer=WordNetLemmatizer()
good_reviews = []
bad_reviews = []
total_reviews = len(content)
for rc in content:
    review = rc.find("div", {"class" : "review_content"})
#     print review
# print review.prettify()
    if not review.find("h2"):
        pass
    else:
        if not review.find("p"):
            review_string = review.h2.string
        else :
            b = review.p.contents
            review_string_input = b[0] + review.h2.string
#     print review_string
    try:
        review_string = clean_words(review_string_input)
        review_string = remove_stop(review_string)
        word_list2 = []
        for word in review_string:
            word_list2.append(lemmatizer.lemmatize(word))
        if "placement" in word_list2:
            if get_review_based_on_index_distance("bad", "placement", word_list2):
                bad_reviews.append(review_string_input)
            else:
                good_reviews.append(review_string_input)
    
        
    except:
        continue


In [15]:
bad_reviews

[u"Good remarks about the faculties, they teach very well\r\nBad remarks is only about the placement. It's only good in computer science,MBA and hotel management fields only.Reviews about placements",
 u'Good Thing - Teachers , Management , Extracurricular Activities , And even placements activities can be included\r\nBad - Ultra Tight Security and restrictions, No liberage for exams or study even if it is your serious family issues .\r\n\t\t\t\t\t\t\tOutstanding Experience',
 u'Not many words, but the placement experience was pretty bad with college. I did get to sit in few of the interviews but could not cash in opportunity for a job. I had to struggle upon my own in the end and got my first job after 1 year gap. This is the case with most of my fellow students. Those of whom got the opportunity to get a job through campus placement are doing good in life but it would have been a good experience if the university put in more efforts for the deserving students of the college.Average e

In [16]:
print "Percentage of bad reviews about placement : " + str(float(len(bad_reviews))/float(total_reviews)*100)

Percentage of bad reviews about placement : 0.975609756098
